In [29]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from datetime import datetime, timedelta

# Load the data from both files
def load_and_normalize_data():
    # Load first dataset (task_schedule.json)
    with open('/home/sfischer/Documents/projects/wk_LinProg/LinProg_Scripts/gantt_output/task_schedule.json', 'r') as f:
        data1 = json.load(f)
    
    # Load second dataset (simulation results)
    with open('/home/sfischer/Documents/projects/wk_LinProg/simulation/results/log_raw/gantt_timing_combined_log_LinProg_opt_test_rep50_link576_Opt_NoC_1_Comp_0.1_scratch10000_comp1.json', 'r') as f:
        data2 = json.load(f)
    
    # Convert first dataset to DataFrame
    df1 = pd.DataFrame(data1)
    df1['Source'] = 'Task Schedule'
    df1['Task_ID'] = df1['Task'].str.extract(r'(\d+)').astype(int)
    
    # Convert second dataset to DataFrame
    df2 = pd.DataFrame(data2['tasks'])
    df2['Source'] = 'Simulation Results'
    df2['Task_ID'] = df2['task_id']
    df2['Task'] = 'Task ' + df2['task_id'].astype(str)
    df2['Start'] = df2['start_time']
    df2['End'] = df2['end_time']
    df2['Duration'] = df2['duration']
    
    # Normalize timing values to start from 0
    df1['Start_Normalized'] = df1['Start'] - df1['Start'].min()
    df1['End_Normalized'] = df1['End'] - df1['Start'].min()
    df1['Duration_Normalized'] = df1['Duration']
    
    df2['Start_Normalized'] = df2['Start'] - df2['Start'].min()
    df2['End_Normalized'] = df2['End'] - df2['Start'].min()
    df2['Duration_Normalized'] = df2['Duration']
    
    # Scale to same time range for comparison
    max_time_1 = df1['End_Normalized'].max()
    max_time_2 = df2['End_Normalized'].max()
    
    # Option 1: Scale both to 0-1 range
    df1['Start_Scaled'] = df1['Start_Normalized'] / max_time_1
    df1['End_Scaled'] = df1['End_Normalized'] / max_time_1
    df1['Duration_Scaled'] = df1['Duration_Normalized'] / max_time_1
    
    df2['Start_Scaled'] = df2['Start_Normalized'] / max_time_2
    df2['End_Scaled'] = df2['End_Normalized'] / max_time_2
    df2['Duration_Scaled'] = df2['Duration_Normalized'] / max_time_2
    
    return df1, df2

# Load the data
df1, df2 = load_and_normalize_data()

print("Dataset 1 (Task Schedule) Summary:")
print(f"Number of tasks: {len(df1)}")
print(f"Time range: {df1['Start'].min():.2f} - {df1['End'].max():.2f}")
print(f"Total duration: {df1['End'].max() - df1['Start'].min():.2f}")
print(f"Average task duration: {df1['Duration'].mean():.2f}")

print("\nDataset 2 (Simulation Results) Summary:")
print(f"Number of tasks: {len(df2)}")
print(f"Time range: {df2['Start'].min():.2f} - {df2['End'].max():.2f}")
print(f"Total duration: {df2['End'].max() - df2['Start'].min():.2f}")
print(f"Average task duration: {df2['Duration'].mean():.2f}")

# Display first few rows of each dataset
print("\nFirst 5 tasks from Dataset 1:")
print(df1[['Task', 'Start', 'End', 'Duration', 'Start_Normalized', 'End_Normalized']].head())

print("\nFirst 5 tasks from Dataset 2:")
print(df2[['Task', 'Start', 'End', 'Duration', 'Start_Normalized', 'End_Normalized']].head())

Dataset 1 (Task Schedule) Summary:
Number of tasks: 70
Time range: 0.00 - 375.00
Total duration: 375.00
Average task duration: 7.23

Dataset 2 (Simulation Results) Summary:
Number of tasks: 71
Time range: 101981.00 - 102202.00
Total duration: 221.00
Average task duration: 4.46

First 5 tasks from Dataset 1:
     Task  Start    End  Duration  Start_Normalized  End_Normalized
0  Task 0    0.0    1.0       1.0               0.0             1.0
1  Task 1    1.0    5.0       4.0               1.0             5.0
2  Task 2    5.0   12.0       7.0               5.0            12.0
3  Task 3   12.0  102.0      90.0              12.0           102.0
4  Task 4   97.0  102.0       5.0              97.0           102.0

First 5 tasks from Dataset 2:
      Task   Start     End  Duration  Start_Normalized  End_Normalized
0  Task 70  101981  101981         0                 0               0
1   Task 0  101981  101982         1                 0               1
2   Task 1  101982  101986         4   

In [30]:
def create_gantt_comparison():
    """Create side-by-side Gantt charts for comparison"""
    
    # Create subplots
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Task Schedule (Original)', 'Simulation Results'),
        vertical_spacing=0.1,
        specs=[[{"secondary_y": False}], [{"secondary_y": False}]]
    )
    
    # Colors for tasks
    colors1 = px.colors.qualitative.Set3
    colors2 = px.colors.qualitative.Pastel
    
    # Plot first dataset (normalized)
    for i, row in df1.iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row['Start_Normalized'], row['End_Normalized'], row['End_Normalized'], row['Start_Normalized'], row['Start_Normalized']],
                y=[row['Task_ID']-0.4, row['Task_ID']-0.4, row['Task_ID']+0.4, row['Task_ID']+0.4, row['Task_ID']-0.4],
                fill='toself',
                fillcolor=colors1[i % len(colors1)],
                line=dict(color='black', width=1),
                name=f"Task {row['Task_ID']}",
                text=f"Duration: {row['Duration']:.1f}",
                hovertemplate=f"<b>Task {row['Task_ID']}</b><br>" +
                             f"Start: {row['Start']:.1f}<br>" +
                             f"End: {row['End']:.1f}<br>" +
                             f"Duration: {row['Duration']:.1f}<br>" +
                             "<extra></extra>",
                showlegend=False
            ),
            row=1, col=1
        )
    
    # Plot second dataset (normalized)
    for i, row in df2.iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row['Start_Normalized'], row['End_Normalized'], row['End_Normalized'], row['Start_Normalized'], row['Start_Normalized']],
                y=[row['Task_ID']-0.4, row['Task_ID']-0.4, row['Task_ID']+0.4, row['Task_ID']+0.4, row['Task_ID']-0.4],
                fill='toself',
                fillcolor=colors2[i % len(colors2)],
                line=dict(color='black', width=1),
                name=f"Task {row['Task_ID']}",
                text=f"Duration: {row['Duration']:.1f}",
                hovertemplate=f"<b>Task {row['Task_ID']}</b><br>" +
                             f"Start: {row['Start']:.1f}<br>" +
                             f"End: {row['End']:.1f}<br>" +
                             f"Duration: {row['Duration']:.1f}<br>" +
                             f"Accelerator: {row.get('accelerator', 'N/A')}<br>" +
                             "<extra></extra>",
                showlegend=False
            ),
            row=2, col=1
        )
    
    # Update layout
    fig.update_layout(
        title="Gantt Chart Comparison (Normalized Time)",
        height=800,
        showlegend=False
    )
    
    fig.update_xaxes(title_text="Normalized Time", row=1, col=1)
    fig.update_xaxes(title_text="Normalized Time", row=2, col=1)
    fig.update_yaxes(title_text="Task ID", row=1, col=1)
    fig.update_yaxes(title_text="Task ID", row=2, col=1)
    
    return fig

def create_scaled_comparison():
    """Create comparison with both datasets scaled to 0-1 range"""
    
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Task Schedule (Scaled 0-1)', 'Simulation Results (Scaled 0-1)'),
        vertical_spacing=0.1
    )
    
    # Plot first dataset (scaled)
    for i, row in df1.iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row['Start_Scaled'], row['End_Scaled'], row['End_Scaled'], row['Start_Scaled'], row['Start_Scaled']],
                y=[row['Task_ID']-0.4, row['Task_ID']-0.4, row['Task_ID']+0.4, row['Task_ID']+0.4, row['Task_ID']-0.4],
                fill='toself',
                fillcolor=px.colors.qualitative.Set3[i % len(px.colors.qualitative.Set3)],
                line=dict(color='black', width=1),
                name=f"Task {row['Task_ID']}",
                hovertemplate=f"<b>Task {row['Task_ID']}</b><br>" +
                             f"Scaled Start: {row['Start_Scaled']:.3f}<br>" +
                             f"Scaled End: {row['End_Scaled']:.3f}<br>" +
                             f"Scaled Duration: {row['Duration_Scaled']:.3f}<br>" +
                             "<extra></extra>",
                showlegend=False
            ),
            row=1, col=1
        )
    
    # Plot second dataset (scaled)
    for i, row in df2.iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row['Start_Scaled'], row['End_Scaled'], row['End_Scaled'], row['Start_Scaled'], row['Start_Scaled']],
                y=[row['Task_ID']-0.4, row['Task_ID']-0.4, row['Task_ID']+0.4, row['Task_ID']+0.4, row['Task_ID']-0.4],
                fill='toself',
                fillcolor=px.colors.qualitative.Pastel[i % len(px.colors.qualitative.Pastel)],
                line=dict(color='black', width=1),
                name=f"Task {row['Task_ID']}",
                hovertemplate=f"<b>Task {row['Task_ID']}</b><br>" +
                             f"Scaled Start: {row['Start_Scaled']:.3f}<br>" +
                             f"Scaled End: {row['End_Scaled']:.3f}<br>" +
                             f"Scaled Duration: {row['Duration_Scaled']:.3f}<br>" +
                             f"Accelerator: {row.get('accelerator', 'N/A')}<br>" +
                             "<extra></extra>",
                showlegend=False
            ),
            row=2, col=1
        )
    
    fig.update_layout(
        title="Gantt Chart Comparison (Scaled to 0-1 Range)",
        height=800,
        showlegend=False
    )
    
    fig.update_xaxes(title_text="Scaled Time (0-1)", row=1, col=1)
    fig.update_xaxes(title_text="Scaled Time (0-1)", row=2, col=1)
    fig.update_yaxes(title_text="Task ID", row=1, col=1)
    fig.update_yaxes(title_text="Task ID", row=2, col=1)
    
    return fig

# Create and display the comparisons
gantt_fig = create_gantt_comparison()
gantt_fig.show()

scaled_fig = create_scaled_comparison()
scaled_fig.show()

In [31]:
def analyze_differences():
    """Analyze the differences between the two datasets"""
    
    # Merge datasets by Task_ID for comparison
    common_tasks = set(df1['Task_ID']) & set(df2['Task_ID'])
    
    comparison_data = []
    
    for task_id in common_tasks:
        task1 = df1[df1['Task_ID'] == task_id].iloc[0]
        task2 = df2[df2['Task_ID'] == task_id].iloc[0]
        
        comparison_data.append({
            'Task_ID': task_id,
            'Duration_1': task1['Duration'],
            'Duration_2': task2['Duration'],
            'Duration_Diff': task2['Duration'] - task1['Duration'],
            'Duration_Ratio': task2['Duration'] / task1['Duration'] if task1['Duration'] > 0 else float('inf'),
            'Start_1_Norm': task1['Start_Normalized'],
            'Start_2_Norm': task2['Start_Normalized'],
            'Start_Diff_Norm': task2['Start_Normalized'] - task1['Start_Normalized'],
            'Accelerator': task2.get('accelerator', 'N/A')
        })
    
    comparison_df = pd.DataFrame(comparison_data)
    
    print("=== DETAILED COMPARISON ANALYSIS ===")
    print(f"Common tasks between datasets: {len(common_tasks)}")
    print(f"Tasks only in Dataset 1: {len(set(df1['Task_ID']) - set(df2['Task_ID']))}")
    print(f"Tasks only in Dataset 2: {len(set(df2['Task_ID']) - set(df1['Task_ID']))}")
    
    print("\n=== DURATION COMPARISON ===")
    print(f"Average duration difference: {comparison_df['Duration_Diff'].mean():.2f}")
    print(f"Median duration difference: {comparison_df['Duration_Diff'].median():.2f}")
    print(f"Max duration difference: {comparison_df['Duration_Diff'].max():.2f}")
    print(f"Min duration difference: {comparison_df['Duration_Diff'].min():.2f}")
    
    print("\n=== START TIME COMPARISON (Normalized) ===")
    print(f"Average start time difference: {comparison_df['Start_Diff_Norm'].mean():.2f}")
    print(f"Median start time difference: {comparison_df['Start_Diff_Norm'].median():.2f}")
    
    print("\n=== TASKS WITH LARGEST DIFFERENCES ===")
    print("Top 10 tasks with largest duration differences:")
    top_diff = comparison_df.nlargest(10, 'Duration_Diff')[['Task_ID', 'Duration_1', 'Duration_2', 'Duration_Diff', 'Accelerator']]
    print(top_diff.to_string(index=False))
    
    print("\nTop 10 tasks with largest duration ratios:")
    top_ratio = comparison_df.nlargest(10, 'Duration_Ratio')[['Task_ID', 'Duration_1', 'Duration_2', 'Duration_Ratio', 'Accelerator']]
    print(top_ratio.to_string(index=False))
    
    return comparison_df

def create_difference_plots(comparison_df):
    """Create plots showing the differences between datasets"""
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Duration Comparison', 'Duration Differences', 'Start Time Comparison', 'Duration Ratio Distribution'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )
    
    # Duration comparison scatter plot
    fig.add_trace(
        go.Scatter(
            x=comparison_df['Duration_1'],
            y=comparison_df['Duration_2'],
            mode='markers',
            name='Duration Comparison',
            text=comparison_df['Task_ID'],
            hovertemplate='<b>Task %{text}</b><br>' +
                         'Dataset 1: %{x:.1f}<br>' +
                         'Dataset 2: %{y:.1f}<br>' +
                         '<extra></extra>',
            marker=dict(size=8, color='blue', opacity=0.6)
        ),
        row=1, col=1
    )
    
    # Add diagonal line for reference
    max_duration = max(comparison_df['Duration_1'].max(), comparison_df['Duration_2'].max())
    fig.add_trace(
        go.Scatter(
            x=[0, max_duration],
            y=[0, max_duration],
            mode='lines',
            name='Equal Duration',
            line=dict(dash='dash', color='red'),
            showlegend=False
        ),
        row=1, col=1
    )
    
    # Duration differences
    fig.add_trace(
        go.Bar(
            x=comparison_df['Task_ID'],
            y=comparison_df['Duration_Diff'],
            name='Duration Difference',
            marker=dict(color=np.where(comparison_df['Duration_Diff'] > 0, 'green', 'red'))
        ),
        row=1, col=2
    )
    
    # Start time comparison
    fig.add_trace(
        go.Scatter(
            x=comparison_df['Start_1_Norm'],
            y=comparison_df['Start_2_Norm'],
            mode='markers',
            name='Start Time Comparison',
            text=comparison_df['Task_ID'],
            hovertemplate='<b>Task %{text}</b><br>' +
                         'Dataset 1: %{x:.1f}<br>' +
                         'Dataset 2: %{y:.1f}<br>' +
                         '<extra></extra>',
            marker=dict(size=8, color='orange', opacity=0.6)
        ),
        row=2, col=1
    )
    
    # Duration ratio histogram
    fig.add_trace(
        go.Histogram(
            x=comparison_df['Duration_Ratio'],
            nbinsx=20,
            name='Duration Ratio Distribution',
            marker=dict(color='purple', opacity=0.7)
        ),
        row=2, col=2
    )
    
    fig.update_layout(
        title="Detailed Comparison Analysis",
        height=800,
        showlegend=False
    )
    
    fig.update_xaxes(title_text="Dataset 1 Duration", row=1, col=1)
    fig.update_yaxes(title_text="Dataset 2 Duration", row=1, col=1)
    fig.update_xaxes(title_text="Task ID", row=1, col=2)
    fig.update_yaxes(title_text="Duration Difference", row=1, col=2)
    fig.update_xaxes(title_text="Dataset 1 Start Time", row=2, col=1)
    fig.update_yaxes(title_text="Dataset 2 Start Time", row=2, col=1)
    fig.update_xaxes(title_text="Duration Ratio", row=2, col=2)
    fig.update_yaxes(title_text="Frequency", row=2, col=2)
    
    return fig

# Perform the analysis
comparison_df = analyze_differences()

# Create and display difference plots
diff_fig = create_difference_plots(comparison_df)
diff_fig.show()

=== DETAILED COMPARISON ANALYSIS ===
Common tasks between datasets: 70
Tasks only in Dataset 1: 0
Tasks only in Dataset 2: 1

=== DURATION COMPARISON ===
Average duration difference: -2.70
Median duration difference: 0.00
Max duration difference: 0.00
Min duration difference: -53.00

=== START TIME COMPARISON (Normalized) ===
Average start time difference: -104.69
Median start time difference: -114.00

=== TASKS WITH LARGEST DIFFERENCES ===
Top 10 tasks with largest duration differences:
 Task_ID  Duration_1  Duration_2  Duration_Diff  Accelerator
       0         1.0           1            0.0           10
       1         4.0           4            0.0           10
       2         7.0           7            0.0           10
       4         5.0           5            0.0            6
       5         0.0           0            0.0            6
       7         5.0           5            0.0            3
       8         1.0           1            0.0            7
       9         3.

In [32]:
def create_summary_table():
    """Create a comprehensive summary table"""
    
    # Calculate statistics for both datasets
    stats_data = {
        'Metric': [
            'Total Tasks',
            'Total Execution Time',
            'Average Task Duration',
            'Median Task Duration',
            'Min Task Duration',
            'Max Task Duration',
            'Tasks with Zero Duration',
            'Task Duration Std Dev'
        ],
        'Task Schedule': [
            len(df1),
            f"{df1['End'].max() - df1['Start'].min():.2f}",
            f"{df1['Duration'].mean():.2f}",
            f"{df1['Duration'].median():.2f}",
            f"{df1['Duration'].min():.2f}",
            f"{df1['Duration'].max():.2f}",
            len(df1[df1['Duration'] == 0]),
            f"{df1['Duration'].std():.2f}"
        ],
        'Simulation Results': [
            len(df2),
            f"{df2['End'].max() - df2['Start'].min():.2f}",
            f"{df2['Duration'].mean():.2f}",
            f"{df2['Duration'].median():.2f}",
            f"{df2['Duration'].min():.2f}",
            f"{df2['Duration'].max():.2f}",
            len(df2[df2['Duration'] == 0]),
            f"{df2['Duration'].std():.2f}"
        ]
    }
    
    stats_df = pd.DataFrame(stats_data)
    
    print("=== COMPREHENSIVE SUMMARY TABLE ===")
    print(stats_df.to_string(index=False))
    
    # Task execution pattern analysis
    print("\n=== TASK EXECUTION PATTERN ANALYSIS ===")
    
    # Check for parallelism in both datasets
    def analyze_parallelism(df, name):
        overlapping_tasks = 0
        total_comparisons = 0
        
        for i in range(len(df)):
            for j in range(i+1, len(df)):
                task1 = df.iloc[i]
                task2 = df.iloc[j]
                
                # Check if tasks overlap
                if (task1['Start'] < task2['End'] and task2['Start'] < task1['End']):
                    overlapping_tasks += 1
                total_comparisons += 1
        
        parallelism_ratio = overlapping_tasks / total_comparisons if total_comparisons > 0 else 0
        
        print(f"{name}:")
        print(f"  - Overlapping task pairs: {overlapping_tasks}")
        print(f"  - Total task pairs: {total_comparisons}")
        print(f"  - Parallelism ratio: {parallelism_ratio:.3f}")
        
        return parallelism_ratio
    
    para1 = analyze_parallelism(df1, "Task Schedule")
    para2 = analyze_parallelism(df2, "Simulation Results")
    
    # Accelerator usage analysis (only for simulation results)
    if 'accelerator' in df2.columns:
        print(f"\n=== ACCELERATOR USAGE (Simulation Results) ===")
        accel_usage = df2['accelerator'].value_counts().sort_index()
        print("Tasks per accelerator:")
        for accel, count in accel_usage.items():
            print(f"  Accelerator {accel}: {count} tasks")
    
    return stats_df

def create_timeline_overlay():
    """Create an overlay comparison showing both datasets on the same timeline"""
    
    fig = go.Figure()
    
    # Add Dataset 1 tasks (normalized)
    for i, row in df1.iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row['Start_Normalized'], row['End_Normalized'], row['End_Normalized'], row['Start_Normalized'], row['Start_Normalized']],
                y=[row['Task_ID']-0.35, row['Task_ID']-0.35, row['Task_ID']-0.05, row['Task_ID']-0.05, row['Task_ID']-0.35],
                fill='toself',
                fillcolor='rgba(0, 100, 255, 0.6)',
                line=dict(color='blue', width=1),
                name=f"Schedule Task {row['Task_ID']}",
                legendgroup="schedule",
                showlegend=i==0,
                hovertemplate=f"<b>Schedule Task {row['Task_ID']}</b><br>" +
                             f"Start: {row['Start']:.1f}<br>" +
                             f"End: {row['End']:.1f}<br>" +
                             f"Duration: {row['Duration']:.1f}<br>" +
                             "<extra></extra>"
            )
        )
    
    # Add Dataset 2 tasks (normalized)
    for i, row in df2.iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row['Start_Normalized'], row['End_Normalized'], row['End_Normalized'], row['Start_Normalized'], row['Start_Normalized']],
                y=[row['Task_ID']+0.05, row['Task_ID']+0.05, row['Task_ID']+0.35, row['Task_ID']+0.35, row['Task_ID']+0.05],
                fill='toself',
                fillcolor='rgba(255, 100, 0, 0.6)',
                line=dict(color='red', width=1),
                name=f"Simulation Task {row['Task_ID']}",
                legendgroup="simulation",
                showlegend=i==0,
                hovertemplate=f"<b>Simulation Task {row['Task_ID']}</b><br>" +
                             f"Start: {row['Start']:.1f}<br>" +
                             f"End: {row['End']:.1f}<br>" +
                             f"Duration: {row['Duration']:.1f}<br>" +
                             f"Accelerator: {row.get('accelerator', 'N/A')}<br>" +
                             "<extra></extra>"
            )
        )
    
    # Add legend entries
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers', 
                           marker=dict(size=10, color='rgba(0, 100, 255, 0.6)'),
                           legendgroup="schedule", showlegend=True, name="Task Schedule"))
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers', 
                           marker=dict(size=10, color='rgba(255, 100, 0, 0.6)'),
                           legendgroup="simulation", showlegend=True, name="Simulation Results"))
    
    fig.update_layout(
        title="Timeline Overlay Comparison (Normalized Time)",
        xaxis_title="Normalized Time",
        yaxis_title="Task ID",
        height=600,
        hovermode='closest'
    )
    
    return fig

# Create and display the summary
summary_df = create_summary_table()

# Create and display the timeline overlay
overlay_fig = create_timeline_overlay()
overlay_fig.show()

print("\n=== FINAL ANALYSIS SUMMARY ===")
print("1. Both datasets contain similar task structures but with different timing characteristics")
print("2. The simulation results show more detailed accelerator assignments")
print("3. Duration differences suggest different execution environments or optimizations")
print("4. Normalized comparisons help identify structural similarities despite timing differences")
print("5. The overlay plot shows how tasks align between the two schedules")

=== COMPREHENSIVE SUMMARY TABLE ===
                  Metric Task Schedule Simulation Results
             Total Tasks            70                 71
    Total Execution Time        375.00             221.00
   Average Task Duration          7.23               4.46
    Median Task Duration          1.50               1.00
       Min Task Duration          0.00               0.00
       Max Task Duration         92.00              73.00
Tasks with Zero Duration             8                  9
   Task Duration Std Dev         19.14              10.70

=== TASK EXECUTION PATTERN ANALYSIS ===
Task Schedule:
  - Overlapping task pairs: 24
  - Total task pairs: 2415
  - Parallelism ratio: 0.010
Simulation Results:
  - Overlapping task pairs: 30
  - Total task pairs: 2485
  - Parallelism ratio: 0.012

=== ACCELERATOR USAGE (Simulation Results) ===
Tasks per accelerator:
  Accelerator 0: 14 tasks
  Accelerator 1: 18 tasks
  Accelerator 2: 18 tasks
  Accelerator 3: 7 tasks
  Accelerator 4: 3


=== FINAL ANALYSIS SUMMARY ===
1. Both datasets contain similar task structures but with different timing characteristics
2. The simulation results show more detailed accelerator assignments
3. Duration differences suggest different execution environments or optimizations
4. Normalized comparisons help identify structural similarities despite timing differences
5. The overlay plot shows how tasks align between the two schedules


In [33]:
# Debug: Check data types and fix the issue
print("=== DATA TYPE DEBUGGING ===")
print("df1 Duration column type:", df1['Duration'].dtype)
print("df2 Duration column type:", df2['Duration'].dtype)
print("Sample values from df1['Duration']:", df1['Duration'].head().tolist())
print("Sample values from df2['Duration']:", df2['Duration'].head().tolist())

# Check if there are any non-numeric values
print("\nChecking for non-numeric values:")
print("df1 Duration non-numeric:", df1['Duration'].apply(lambda x: not isinstance(x, (int, float))).sum())
print("df2 Duration non-numeric:", df2['Duration'].apply(lambda x: not isinstance(x, (int, float))).sum())

# Let's look at the actual comparison_df values
print("\nComparison DataFrame info:")
print("Duration_1 type:", comparison_df['Duration_1'].dtype)
print("Duration_2 type:", comparison_df['Duration_2'].dtype)
print("Duration_Diff type:", comparison_df['Duration_Diff'].dtype)

# Check some actual values
print("\nActual comparison values (first 10):")
print(comparison_df[['Task_ID', 'Duration_1', 'Duration_2', 'Duration_Diff']].head(10))

# Check if the issue is with the difference calculation
print("\nManual calculation check:")
for i in range(5):
    row = comparison_df.iloc[i]
    manual_diff = float(row['Duration_2']) - float(row['Duration_1'])
    print(f"Task {row['Task_ID']}: {row['Duration_1']} - {row['Duration_2']} = {manual_diff} (stored: {row['Duration_Diff']})")

=== DATA TYPE DEBUGGING ===
df1 Duration column type: float64
df2 Duration column type: int64
Sample values from df1['Duration']: [1.0, 4.0, 7.0, 90.0, 5.0]
Sample values from df2['Duration']: [0, 1, 4, 7, 5]

Checking for non-numeric values:
df1 Duration non-numeric: 0
df2 Duration non-numeric: 0

Comparison DataFrame info:
Duration_1 type: float64
Duration_2 type: int64
Duration_Diff type: float64

Actual comparison values (first 10):
   Task_ID  Duration_1  Duration_2  Duration_Diff
0        0         1.0           1            0.0
1        1         4.0           4            0.0
2        2         7.0           7            0.0
3        3        90.0          37          -53.0
4        4         5.0           5            0.0
5        5         0.0           0            0.0
6        6        90.0          73          -17.0
7        7         5.0           5            0.0
8        8         1.0           1            0.0
9        9         3.0           3            0.0

Manual c

In [34]:
def load_and_normalize_data_fixed():
    """Fixed version with proper data type handling"""
    
    # Load first dataset (task_schedule.json)
    with open('/home/sfischer/Documents/projects/wk_LinProg/LinProg_Scripts/gantt_output/task_schedule.json', 'r') as f:
        data1 = json.load(f)
    
    # Load second dataset (simulation results)
    with open('/home/sfischer/Documents/projects/wk_LinProg/simulation/results/log_raw/gantt_timing_combined_log_LinProg_opt_test_rep50_link576_Opt_NoC_1_Comp_0.1_scratch10000_comp1.json', 'r') as f:
        data2 = json.load(f)
    
    # Convert first dataset to DataFrame with explicit type conversion
    df1 = pd.DataFrame(data1)
    df1['Source'] = 'Task Schedule'
    df1['Task_ID'] = df1['Task'].str.extract(r'(\d+)').astype(int)
    
    # Ensure numeric types for timing columns
    df1['Start'] = pd.to_numeric(df1['Start'], errors='coerce')
    df1['End'] = pd.to_numeric(df1['End'], errors='coerce')
    df1['Duration'] = pd.to_numeric(df1['Duration'], errors='coerce')
    
    # Convert second dataset to DataFrame with explicit type conversion
    df2 = pd.DataFrame(data2['tasks'])
    df2['Source'] = 'Simulation Results'
    df2['Task_ID'] = df2['task_id'].astype(int)
    df2['Task'] = 'Task ' + df2['task_id'].astype(str)
    
    # Ensure numeric types for timing columns
    df2['Start'] = pd.to_numeric(df2['start_time'], errors='coerce')
    df2['End'] = pd.to_numeric(df2['end_time'], errors='coerce')
    df2['Duration'] = pd.to_numeric(df2['duration'], errors='coerce')
    
    # Remove any rows with NaN values
    df1 = df1.dropna(subset=['Start', 'End', 'Duration'])
    df2 = df2.dropna(subset=['Start', 'End', 'Duration'])
    
    # Normalize timing values to start from 0
    df1['Start_Normalized'] = df1['Start'] - df1['Start'].min()
    df1['End_Normalized'] = df1['End'] - df1['Start'].min()
    df1['Duration_Normalized'] = df1['Duration']
    
    df2['Start_Normalized'] = df2['Start'] - df2['Start'].min()
    df2['End_Normalized'] = df2['End'] - df2['Start'].min()
    df2['Duration_Normalized'] = df2['Duration']
    
    # Scale to same time range for comparison
    max_time_1 = df1['End_Normalized'].max()
    max_time_2 = df2['End_Normalized'].max()
    
    # Scale both to 0-1 range
    df1['Start_Scaled'] = df1['Start_Normalized'] / max_time_1
    df1['End_Scaled'] = df1['End_Normalized'] / max_time_1
    df1['Duration_Scaled'] = df1['Duration_Normalized'] / max_time_1
    
    df2['Start_Scaled'] = df2['Start_Normalized'] / max_time_2
    df2['End_Scaled'] = df2['End_Normalized'] / max_time_2
    df2['Duration_Scaled'] = df2['Duration_Normalized'] / max_time_2
    
    return df1, df2

def analyze_differences_fixed():
    """Fixed version of difference analysis with proper type handling"""
    
    # Use the fixed data
    df1_fixed, df2_fixed = load_and_normalize_data_fixed()
    
    # Merge datasets by Task_ID for comparison
    common_tasks = set(df1_fixed['Task_ID']) & set(df2_fixed['Task_ID'])
    
    comparison_data = []
    
    for task_id in common_tasks:
        task1 = df1_fixed[df1_fixed['Task_ID'] == task_id].iloc[0]
        task2 = df2_fixed[df2_fixed['Task_ID'] == task_id].iloc[0]
        
        # Ensure all values are properly converted to float
        duration_1 = float(task1['Duration'])
        duration_2 = float(task2['Duration'])
        start_1_norm = float(task1['Start_Normalized'])
        start_2_norm = float(task2['Start_Normalized'])
        
        comparison_data.append({
            'Task_ID': int(task_id),
            'Duration_1': duration_1,
            'Duration_2': duration_2,
            'Duration_Diff': duration_2 - duration_1,
            'Duration_Ratio': duration_2 / duration_1 if duration_1 > 0 else float('inf'),
            'Start_1_Norm': start_1_norm,
            'Start_2_Norm': start_2_norm,
            'Start_Diff_Norm': start_2_norm - start_1_norm,
            'Accelerator': task2.get('accelerator', 'N/A')
        })
    
    comparison_df_fixed = pd.DataFrame(comparison_data)
    
    print("=== FIXED COMPARISON ANALYSIS ===")
    print(f"Common tasks between datasets: {len(common_tasks)}")
    print(f"Tasks only in Dataset 1: {len(set(df1_fixed['Task_ID']) - set(df2_fixed['Task_ID']))}")
    print(f"Tasks only in Dataset 2: {len(set(df2_fixed['Task_ID']) - set(df1_fixed['Task_ID']))}")
    
    print("\n=== DURATION COMPARISON (FIXED) ===")
    print(f"Average duration difference: {comparison_df_fixed['Duration_Diff'].mean():.2f}")
    print(f"Median duration difference: {comparison_df_fixed['Duration_Diff'].median():.2f}")
    print(f"Max duration difference: {comparison_df_fixed['Duration_Diff'].max():.2f}")
    print(f"Min duration difference: {comparison_df_fixed['Duration_Diff'].min():.2f}")
    print(f"Standard deviation of differences: {comparison_df_fixed['Duration_Diff'].std():.2f}")
    
    print("\n=== START TIME COMPARISON (Normalized, FIXED) ===")
    print(f"Average start time difference: {comparison_df_fixed['Start_Diff_Norm'].mean():.2f}")
    print(f"Median start time difference: {comparison_df_fixed['Start_Diff_Norm'].median():.2f}")
    print(f"Max start time difference: {comparison_df_fixed['Start_Diff_Norm'].max():.2f}")
    print(f"Min start time difference: {comparison_df_fixed['Start_Diff_Norm'].min():.2f}")
    
    print("\n=== TASKS WITH LARGEST DIFFERENCES (FIXED) ===")
    # Sort by absolute difference to see both positive and negative differences
    comparison_df_fixed['Abs_Duration_Diff'] = abs(comparison_df_fixed['Duration_Diff'])
    
    print("Top 10 tasks with largest absolute duration differences:")
    top_diff = comparison_df_fixed.nlargest(10, 'Abs_Duration_Diff')[['Task_ID', 'Duration_1', 'Duration_2', 'Duration_Diff', 'Accelerator']]
    print(top_diff.to_string(index=False))
    
    print("\nTop 10 tasks with largest positive duration differences:")
    positive_diff = comparison_df_fixed[comparison_df_fixed['Duration_Diff'] > 0]
    if len(positive_diff) > 0:
        top_pos_diff = positive_diff.nlargest(10, 'Duration_Diff')[['Task_ID', 'Duration_1', 'Duration_2', 'Duration_Diff', 'Accelerator']]
        print(top_pos_diff.to_string(index=False))
    else:
        print("No tasks with positive duration differences found")
    
    print("\nTop 10 tasks with largest negative duration differences:")
    negative_diff = comparison_df_fixed[comparison_df_fixed['Duration_Diff'] < 0]
    if len(negative_diff) > 0:
        top_neg_diff = negative_diff.nsmallest(10, 'Duration_Diff')[['Task_ID', 'Duration_1', 'Duration_2', 'Duration_Diff', 'Accelerator']]
        print(top_neg_diff.to_string(index=False))
    else:
        print("No tasks with negative duration differences found")
    
    # Show tasks with exact matches (zero difference)
    zero_diff = comparison_df_fixed[comparison_df_fixed['Duration_Diff'] == 0.0]
    print(f"\nTasks with exactly matching durations: {len(zero_diff)}")
    if len(zero_diff) > 0:
        print("Sample of tasks with zero difference:")
        print(zero_diff[['Task_ID', 'Duration_1', 'Duration_2', 'Duration_Diff', 'Accelerator']].head(10).to_string(index=False))
    
    return comparison_df_fixed, df1_fixed, df2_fixed

# Run the fixed analysis
comparison_df_fixed, df1_fixed, df2_fixed = analyze_differences_fixed()

=== FIXED COMPARISON ANALYSIS ===
Common tasks between datasets: 70
Tasks only in Dataset 1: 0
Tasks only in Dataset 2: 1

=== DURATION COMPARISON (FIXED) ===
Average duration difference: -2.70
Median duration difference: 0.00
Max duration difference: 0.00
Min duration difference: -53.00
Standard deviation of differences: 9.74

=== START TIME COMPARISON (Normalized, FIXED) ===
Average start time difference: -104.69
Median start time difference: -114.00
Max start time difference: 0.00
Min start time difference: -168.00

=== TASKS WITH LARGEST DIFFERENCES (FIXED) ===
Top 10 tasks with largest absolute duration differences:
 Task_ID  Duration_1  Duration_2  Duration_Diff  Accelerator
       3        90.0        37.0          -53.0           10
      10        92.0        39.0          -53.0            7
      16        33.0        14.0          -19.0            3
      34        33.0        14.0          -19.0            2
      52        37.0        19.0          -18.0            1
     

In [35]:
def create_improved_difference_plots(comparison_df_fixed):
    """Create improved plots with the fixed data"""
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Duration Comparison (Fixed)', 'Duration Differences (Fixed)', 
                       'Start Time Comparison (Fixed)', 'Duration Ratio Distribution (Fixed)'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )
    
    # Duration comparison scatter plot
    fig.add_trace(
        go.Scatter(
            x=comparison_df_fixed['Duration_1'],
            y=comparison_df_fixed['Duration_2'],
            mode='markers',
            name='Duration Comparison',
            text=comparison_df_fixed['Task_ID'],
            hovertemplate='<b>Task %{text}</b><br>' +
                         'Dataset 1: %{x:.1f}<br>' +
                         'Dataset 2: %{y:.1f}<br>' +
                         'Difference: %{customdata:.1f}<br>' +
                         '<extra></extra>',
            customdata=comparison_df_fixed['Duration_Diff'],
            marker=dict(size=8, color='blue', opacity=0.6)
        ),
        row=1, col=1
    )
    
    # Add diagonal line for reference
    max_duration = max(comparison_df_fixed['Duration_1'].max(), comparison_df_fixed['Duration_2'].max())
    fig.add_trace(
        go.Scatter(
            x=[0, max_duration],
            y=[0, max_duration],
            mode='lines',
            name='Equal Duration',
            line=dict(dash='dash', color='red'),
            showlegend=False
        ),
        row=1, col=1
    )
    
    # Duration differences bar chart
    # Color bars based on positive/negative differences
    bar_colors = ['green' if x > 0 else 'red' if x < 0 else 'gray' for x in comparison_df_fixed['Duration_Diff']]
    
    fig.add_trace(
        go.Bar(
            x=comparison_df_fixed['Task_ID'],
            y=comparison_df_fixed['Duration_Diff'],
            name='Duration Difference',
            marker=dict(color=bar_colors),
            hovertemplate='<b>Task %{x}</b><br>' +
                         'Difference: %{y:.1f}<br>' +
                         '<extra></extra>'
        ),
        row=1, col=2
    )
    
    # Start time comparison
    fig.add_trace(
        go.Scatter(
            x=comparison_df_fixed['Start_1_Norm'],
            y=comparison_df_fixed['Start_2_Norm'],
            mode='markers',
            name='Start Time Comparison',
            text=comparison_df_fixed['Task_ID'],
            hovertemplate='<b>Task %{text}</b><br>' +
                         'Dataset 1: %{x:.1f}<br>' +
                         'Dataset 2: %{y:.1f}<br>' +
                         'Difference: %{customdata:.1f}<br>' +
                         '<extra></extra>',
            customdata=comparison_df_fixed['Start_Diff_Norm'],
            marker=dict(size=8, color='orange', opacity=0.6)
        ),
        row=2, col=1
    )
    
    # Add diagonal line for start time reference
    max_start = max(comparison_df_fixed['Start_1_Norm'].max(), comparison_df_fixed['Start_2_Norm'].max())
    fig.add_trace(
        go.Scatter(
            x=[0, max_start],
            y=[0, max_start],
            mode='lines',
            name='Equal Start Time',
            line=dict(dash='dash', color='red'),
            showlegend=False
        ),
        row=2, col=1
    )
    
    # Duration ratio histogram (filter out infinite values)
    finite_ratios = comparison_df_fixed[comparison_df_fixed['Duration_Ratio'] != float('inf')]['Duration_Ratio']
    
    fig.add_trace(
        go.Histogram(
            x=finite_ratios,
            nbinsx=30,
            name='Duration Ratio Distribution',
            marker=dict(color='purple', opacity=0.7),
            hovertemplate='Ratio: %{x:.2f}<br>' +
                         'Count: %{y}<br>' +
                         '<extra></extra>'
        ),
        row=2, col=2
    )
    
    # Add vertical line at ratio = 1 for reference
    fig.add_vline(x=1, line_dash="dash", line_color="red", row=2, col=2)
    
    fig.update_layout(
        title="Detailed Comparison Analysis (Fixed Data Types)",
        height=800,
        showlegend=False
    )
    
    fig.update_xaxes(title_text="Dataset 1 Duration", row=1, col=1)
    fig.update_yaxes(title_text="Dataset 2 Duration", row=1, col=1)
    fig.update_xaxes(title_text="Task ID", row=1, col=2)
    fig.update_yaxes(title_text="Duration Difference", row=1, col=2)
    fig.update_xaxes(title_text="Dataset 1 Start Time", row=2, col=1)
    fig.update_yaxes(title_text="Dataset 2 Start Time", row=2, col=1)
    fig.update_xaxes(title_text="Duration Ratio", row=2, col=2)
    fig.update_yaxes(title_text="Frequency", row=2, col=2)
    
    return fig

def create_distribution_analysis():
    """Analyze the distribution of differences"""
    
    print("=== DISTRIBUTION ANALYSIS OF DIFFERENCES ===")
    
    # Basic statistics
    diff_stats = comparison_df_fixed['Duration_Diff'].describe()
    print("Duration Difference Statistics:")
    print(diff_stats)
    
    # Count tasks in different categories
    positive_diff = len(comparison_df_fixed[comparison_df_fixed['Duration_Diff'] > 0])
    negative_diff = len(comparison_df_fixed[comparison_df_fixed['Duration_Diff'] < 0])
    zero_diff = len(comparison_df_fixed[comparison_df_fixed['Duration_Diff'] == 0])
    
    print(f"\nTask Distribution:")
    print(f"Tasks with longer duration in simulation: {positive_diff}")
    print(f"Tasks with shorter duration in simulation: {negative_diff}")
    print(f"Tasks with identical duration: {zero_diff}")
    
    # Analyze by accelerator
    if 'Accelerator' in comparison_df_fixed.columns:
        print(f"\nDuration Differences by Accelerator:")
        accel_analysis = comparison_df_fixed.groupby('Accelerator')['Duration_Diff'].agg(['mean', 'std', 'count'])
        print(accel_analysis)
    
    return diff_stats

# Create improved visualizations
improved_fig = create_improved_difference_plots(comparison_df_fixed)
improved_fig.show()

# Perform distribution analysis
dist_stats = create_distribution_analysis()

=== DISTRIBUTION ANALYSIS OF DIFFERENCES ===
Duration Difference Statistics:
count    70.000000
mean     -2.700000
std       9.737645
min     -53.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       0.000000
Name: Duration_Diff, dtype: float64

Task Distribution:
Tasks with longer duration in simulation: 0
Tasks with shorter duration in simulation: 7
Tasks with identical duration: 63

Duration Differences by Accelerator:
                  mean        std  count
Accelerator                             
0             0.000000   0.000000     14
1            -1.000000   4.242641     18
2            -1.055556   4.478343     18
3            -2.714286   7.181325      7
4            -3.333333   5.773503      3
6            -5.666667   9.814955      3
7           -17.666667  30.599564      3
10          -13.250000  26.500000      4
