Model: gpt-5.2

📋 Dashboard Overview

📋 Configuration Summaries

⚙️ text_active_think

📊 Samples: 10

🔍 Exploration

avg_node_coverage: 0.957
avg_edge_coverage: 0.461
avg_exploration_steps: 12.300
avg_action_cost: 11.300
avg_action_fail_ratio: 0.012
avg_valid_action_ratio: 1
avg_final_information_gain: 0.794
avg_false_belief_steps: 8.300
avg_false_belief_f1: 0.950
avg_false_belief_f1_position: 0.980
avg_false_belief_f1_facing: 0.917
avg_false_belief_action_cost: 7.300
avg_false_belief_action_cost_after_seen: 0.875
avg_action_counts:
move: 2.700
rotate: 7.700
return: 0
observe: 11.300
term: 0.900
forced_term: 0.100
query: 0

✅ Evaluation

avg_accuracy: 0.671
task_metrics:
DirectionEvaluationTask:
accuracy: 0.717
total_count: 30
task_score: 21.500
PovEvaluationTask:
accuracy: 0.717
total_count: 30
task_score: 21.500
BackwardPovTextEvaluationTask:
accuracy: 0.800
total_count: 30
task_score: 24
View2ActionTextEvaluationTask:
accuracy: 0.267
total_count: 30
task_score: 8
AlloMappingEvaluationTask:
accuracy: 0.780
total_count: 30
task_score: 23.409
RotEvaluationTask:
accuracy: 0.800
total_count: 30
task_score: 24
Location2ViewEvaluationTask:
accuracy: 0.567
total_count: 30
task_score: 17
View2LocationTextEvaluationTask:
accuracy: 0.741
total_count: 30
task_score: 22.231
Action2ViewEvaluationTask:
accuracy: 0.650
total_count: 30
task_score: 19.500

✅ Evaluation (prompt_cogmap)

avg_accuracy: 0.683
task_metrics:
DirectionEvaluationTask:
accuracy: 0.700
total_count: 30
task_score: 21
PovEvaluationTask:
accuracy: 0.667
total_count: 30
task_score: 20
BackwardPovTextEvaluationTask:
accuracy: 0.867
total_count: 30
task_score: 26
Action2ViewEvaluationTask:
accuracy: 0.650
total_count: 30
task_score: 19.500
View2ActionTextEvaluationTask:
accuracy: 0.533
total_count: 30
task_score: 16
AlloMappingEvaluationTask:
accuracy: 0.701
total_count: 30
task_score: 21.035
RotEvaluationTask:
accuracy: 0.867
total_count: 30
task_score: 26
Location2ViewEvaluationTask:
accuracy: 0.517
total_count: 30
task_score: 15.500
View2LocationTextEvaluationTask:
accuracy: 0.641
total_count: 30
task_score: 19.240

✅ Evaluation (use_gt_cogmap)

avg_accuracy: 0.954
task_metrics:
DirectionEvaluationTask:
accuracy: 1
total_count: 30
task_score: 30
PovEvaluationTask:
accuracy: 1
total_count: 30
task_score: 30
BackwardPovTextEvaluationTask:
accuracy: 0.967
total_count: 30
task_score: 29
Action2ViewEvaluationTask:
accuracy: 0.983
total_count: 30
task_score: 29.500
View2ActionTextEvaluationTask:
accuracy: 0.767
total_count: 30
task_score: 23
AlloMappingEvaluationTask:
accuracy: 1
total_count: 30
task_score: 30
RotEvaluationTask:
accuracy: 1
total_count: 30
task_score: 30
Location2ViewEvaluationTask:
accuracy: 0.933
total_count: 30
task_score: 28
View2LocationTextEvaluationTask:
accuracy: 0.940
total_count: 30
task_score: 28.197

✅ Evaluation (use_model_cogmap)

avg_accuracy: 0.676
task_metrics:
DirectionEvaluationTask:
accuracy: 0.700
total_count: 30
task_score: 21
PovEvaluationTask:
accuracy: 0.583
total_count: 30
task_score: 17.500
BackwardPovTextEvaluationTask:
accuracy: 0.833
total_count: 30
task_score: 25
Action2ViewEvaluationTask:
accuracy: 0.617
total_count: 30
task_score: 18.500
View2ActionTextEvaluationTask:
accuracy: 0.467
total_count: 30
task_score: 14
AlloMappingEvaluationTask:
accuracy: 0.739
total_count: 30
task_score: 22.180
RotEvaluationTask:
accuracy: 0.933
total_count: 30
task_score: 28
Location2ViewEvaluationTask:
accuracy: 0.533
total_count: 30
task_score: 16
View2LocationTextEvaluationTask:
accuracy: 0.681
total_count: 30
task_score: 20.424

🧠 Cognitive Map

exploration:
error:
global_vs_gt_global_avg:
dir: 0.822
facing: 0.953
overall: 0.865
pos: 0.820
newly_observed_vs_gt_local_avg:
dir: 0.780
facing: 1
overall: 0.877
pos: 0.850
agent_vs_gt_agent_avg:
dir: 0.796
facing: 0.990
overall: 0.891
pos: 0.887
local_vs_gt_local_avg:
dir: 0.790
facing: 1
overall: 0.880
pos: 0.850
consistency:
position_update_avg: 0.689
facing_update_avg: 0.956
position_stability_avg: 0.689
facing_stability_avg: 0.956
local_vs_global_avg:
dir: 0.735
facing: 0.933
overall: 0.841
pos: 0.856
correctness:
last_global_vs_gt_full:
dir: 0.757
facing: 0.957
overall: 0.825
pos: 0.763
n_samples: Global: 10, Local: 10, Newly: 10
evaluation:
correctness:
(none)

🌫️ Fog Probe

recall_avg: 0.785
f1_avg: 0.605
precision_avg: 0.523
n_samples: 10

🧭 False Belief CogMap

inertia: 0.181
changed:
dir: None
facing: 0.650
overall: None
pos: 0.668
retention:
dir: None
facing: 0
overall: None
pos: 0.376
unchanged:
dir: 0.609
facing: 0.779
overall: 0.677
pos: 0.643
unchanged_retention:
dir: None
facing: 0.865
overall: None
pos: 0.761
unchanged_retention_minus_retention:
facing: 1
pos: 0.685
unchanged_exploration:
dir: 0.789
facing: 1
overall: 0.862
pos: 0.795

📈 Correlation

cogmap_acc_correlations:
avg_accuracy:
pearson_r: 0.810
p_value: 0.004
significant: True
n_samples: 10
DirectionEvaluationTask:
pearson_r: 0.380
p_value: 0.279
significant: False
n_samples: 10
View2ActionTextEvaluationTask:
pearson_r: 0.165
p_value: 0.650
significant: False
n_samples: 10
RotDualEvaluationTask:
pearson_r: NaN
p_value: NaN
significant: False
n_samples: 10
View2LocationTextEvaluationTask:
pearson_r: 0.132
p_value: 0.717
significant: False
n_samples: 10
RotEvaluationTask:
pearson_r: 0.403
p_value: 0.248
significant: False
n_samples: 10
AlloMappingEvaluationTask:
pearson_r: 0.720
p_value: 0.019
significant: True
n_samples: 10
Location2ViewEvaluationTask:
pearson_r: 0.409
p_value: 0.240
significant: False
n_samples: 10
PovEvaluationTask:
pearson_r: 0.631
p_value: 0.050
significant: False
n_samples: 10
Action2ViewEvaluationTask:
pearson_r: 0.715
p_value: 0.020
significant: True
n_samples: 10
BackwardPovTextEvaluationTask:
pearson_r: 0.399
p_value: 0.253
significant: False
n_samples: 10
cogmap_infogain_correlation:
pearson_r: 0.403
p_value: 0.249
significant: False
n_samples: 10
n_samples: 10
Performance Charts
Information Gain per Turn
Information Gain per Turn
Cognitive Map (Update)
Cognitive Map Update Turn Averages
Cognitive Map (Full)
Cognitive Map Full Turn Averages
Cognitive Map (Self-Tracking)
Cognitive Map Self-Tracking Turn Averages
Fog Probe F1
Fog Probe F1 per Turn
Fog Probe Precision
Fog Probe Precision per Turn
Fog Probe Recall
Fog Probe Recall per Turn
FB CogMap (Unchanged)
False Belief CogMap Unchanged per Turn
Position Update
Position Update per Turn
Facing Update
Facing Update per Turn
Position Stability
Position Stability per Turn
Facing Stability
Facing Stability per Turn
CogMap vs Accuracy
Cognitive Map vs Accuracy Correlation
CogMap vs InfoGain
Cognitive Map vs Information Gain Correlation

⚙️ text_passive_think_strategist

📊 Samples: 10

🔍 Exploration

avg_node_coverage: None
avg_edge_coverage: None
avg_exploration_steps: None
avg_action_cost: None
avg_action_fail_ratio: None
avg_valid_action_ratio: None
avg_final_information_gain: None
avg_false_belief_steps: None
avg_false_belief_f1: None
avg_false_belief_f1_position: None
avg_false_belief_f1_facing: None
avg_false_belief_action_cost: None
avg_false_belief_action_cost_after_seen: None

✅ Evaluation

avg_accuracy: 0.910
task_metrics:
DirectionEvaluationTask:
accuracy: 0.800
total_count: 30
task_score: 24
PovEvaluationTask:
accuracy: 0.933
total_count: 30
task_score: 28
BackwardPovTextEvaluationTask:
accuracy: 0.967
total_count: 30
task_score: 29
View2ActionTextEvaluationTask:
accuracy: 0.767
total_count: 30
task_score: 23
AlloMappingEvaluationTask:
accuracy: 0.993
total_count: 30
task_score: 29.785
RotEvaluationTask:
accuracy: 0.967
total_count: 30
task_score: 29
Location2ViewEvaluationTask:
accuracy: 0.900
total_count: 30
task_score: 27
View2LocationTextEvaluationTask:
accuracy: 0.951
total_count: 30
task_score: 28.543
Action2ViewEvaluationTask:
accuracy: 0.917
total_count: 30
task_score: 27.500

🧠 Cognitive Map

exploration:
correctness:
global_full:
(none)
n_samples: 0

⚙️ vision_active_think

📊 Samples: 10

🔍 Exploration

avg_node_coverage: 0.993
avg_edge_coverage: 0.537
avg_exploration_steps: 19.300
avg_action_cost: 17.900
avg_action_fail_ratio: 0.051
avg_valid_action_ratio: 1
avg_final_information_gain: 0.866
avg_false_belief_steps: 13.300
avg_false_belief_f1: 0.550
avg_false_belief_f1_position: 0.780
avg_false_belief_f1_facing: 0.117
avg_false_belief_action_cost: 12.300
avg_false_belief_action_cost_after_seen: 4
avg_action_counts:
move: 4.900
rotate: 12.500
return: 0
observe: 17.900
term: 0.900
forced_term: 0.100
query: 0

✅ Evaluation

avg_accuracy: 0.478
task_metrics:
DirectionEvaluationTask:
accuracy: 0.533
total_count: 30
task_score: 16
PovEvaluationTask:
accuracy: 0.367
total_count: 30
task_score: 11
BackwardPovTextEvaluationTask:
accuracy: 0.600
total_count: 30
task_score: 18
BackwardPovVisionEvaluationTask:
accuracy: 0.233
total_count: 30
task_score: 7
View2ActionTextEvaluationTask:
accuracy: 0.367
total_count: 30
task_score: 11
View2ActionVisionEvaluationTask:
accuracy: 0.400
total_count: 30
task_score: 12
AlloMappingEvaluationTask:
accuracy: 0.481
total_count: 30
task_score: 14.425
RotEvaluationTask:
accuracy: 0.700
total_count: 30
task_score: 21
Location2ViewEvaluationTask:
accuracy: 0.367
total_count: 30
task_score: 11
View2LocationTextEvaluationTask:
accuracy: 0.504
total_count: 30
task_score: 15.115
View2LocationVisionEvaluationTask:
accuracy: 0.414
total_count: 30
task_score: 12.431
Action2ViewEvaluationTask:
accuracy: 0.383
total_count: 30
task_score: 11.500

✅ Evaluation (prompt_cogmap)

avg_accuracy: 0.391
task_metrics:
DirectionEvaluationTask:
accuracy: 0.350
total_count: 30
task_score: 10.500
PovEvaluationTask:
accuracy: 0.283
total_count: 30
task_score: 8.500
BackwardPovTextEvaluationTask:
accuracy: 0.533
total_count: 30
task_score: 16
BackwardPovVisionEvaluationTask:
accuracy: 0.400
total_count: 30
task_score: 12
Action2ViewEvaluationTask:
accuracy: 0.350
total_count: 30
task_score: 10.500
View2ActionTextEvaluationTask:
accuracy: 0.367
total_count: 30
task_score: 11
View2ActionVisionEvaluationTask:
accuracy: 0.400
total_count: 30
task_score: 12
AlloMappingEvaluationTask:
accuracy: 0.438
total_count: 30
task_score: 13.145
RotEvaluationTask:
accuracy: 0.400
total_count: 30
task_score: 12
Location2ViewEvaluationTask:
accuracy: 0.350
total_count: 30
task_score: 10.500
View2LocationTextEvaluationTask:
accuracy: 0.444
total_count: 30
task_score: 13.315
View2LocationVisionEvaluationTask:
accuracy: 0.398
total_count: 30
task_score: 11.949

✅ Evaluation (use_gt_cogmap)

avg_accuracy: 0.940
task_metrics:
DirectionEvaluationTask:
accuracy: 1
total_count: 30
task_score: 30
PovEvaluationTask:
accuracy: 1
total_count: 30
task_score: 30
BackwardPovTextEvaluationTask:
accuracy: 0.933
total_count: 30
task_score: 28
BackwardPovVisionEvaluationTask:
accuracy: 0.767
total_count: 30
task_score: 23
Action2ViewEvaluationTask:
accuracy: 0.950
total_count: 30
task_score: 28.500
View2ActionTextEvaluationTask:
accuracy: 0.600
total_count: 30
task_score: 18
View2ActionVisionEvaluationTask:
accuracy: 0.633
total_count: 30
task_score: 19
AlloMappingEvaluationTask:
accuracy: 1
total_count: 30
task_score: 30
RotEvaluationTask:
accuracy: 1
total_count: 30
task_score: 30
Location2ViewEvaluationTask:
accuracy: 1
total_count: 30
task_score: 30
View2LocationTextEvaluationTask:
accuracy: 0.978
total_count: 30
task_score: 29.342
View2LocationVisionEvaluationTask:
accuracy: 0.779
total_count: 30
task_score: 23.377

✅ Evaluation (use_model_cogmap)

avg_accuracy: 0.396
task_metrics:
DirectionEvaluationTask:
accuracy: 0.467
total_count: 30
task_score: 14
PovEvaluationTask:
accuracy: 0.167
total_count: 30
task_score: 5
BackwardPovTextEvaluationTask:
accuracy: 0.433
total_count: 30
task_score: 13
BackwardPovVisionEvaluationTask:
accuracy: 0.500
total_count: 30
task_score: 15
Action2ViewEvaluationTask:
accuracy: 0.450
total_count: 30
task_score: 13.500
View2ActionTextEvaluationTask:
accuracy: 0.367
total_count: 30
task_score: 11
View2ActionVisionEvaluationTask:
accuracy: 0.500
total_count: 30
task_score: 15
AlloMappingEvaluationTask:
accuracy: 0.397
total_count: 30
task_score: 11.916
RotEvaluationTask:
accuracy: 0.500
total_count: 30
task_score: 15
Location2ViewEvaluationTask:
accuracy: 0.283
total_count: 30
task_score: 8.500
View2LocationTextEvaluationTask:
accuracy: 0.498
total_count: 30
task_score: 14.945
View2LocationVisionEvaluationTask:
accuracy: 0.391
total_count: 30
task_score: 11.734

🧠 Cognitive Map

exploration:
error:
global_vs_gt_global_avg:
dir: 0.497
facing: 0.297
overall: 0.447
pos: 0.546
newly_observed_vs_gt_local_avg:
dir: 0.642
facing: 0.383
overall: 0.594
pos: 0.756
agent_vs_gt_agent_avg:
dir: 0.568
facing: 0.979
overall: 0.745
pos: 0.687
local_vs_gt_local_avg:
dir: 0.643
facing: 0.314
overall: 0.566
pos: 0.740
consistency:
position_update_avg: 0.572
facing_update_avg: 0.603
position_stability_avg: 0.572
facing_stability_avg: 0.603
local_vs_global_avg:
dir: 0.446
facing: 0.528
overall: 0.529
pos: 0.615
correctness:
last_global_vs_gt_full:
dir: 0.420
facing: 0.206
overall: 0.373
pos: 0.493
n_samples: Global: 10, Local: 10, Newly: 10
evaluation:
correctness:
(none)

🌫️ Fog Probe

recall_avg: 0.609
f1_avg: 0.552
precision_avg: 0.551
n_samples: 10

🧭 False Belief CogMap

inertia: 0.402
changed:
dir: None
facing: 0.300
overall: None
pos: 0.461
retention:
dir: None
facing: 0.550
overall: None
pos: 0.415
unchanged:
dir: 0.373
facing: 0.245
overall: 0.359
pos: 0.458
unchanged_retention:
dir: None
facing: 0.740
overall: None
pos: 0.594
unchanged_retention_minus_retention:
facing: 0.105
pos: 0.442
unchanged_exploration:
dir: 0.471
facing: 0.113
overall: 0.371
pos: 0.529

📈 Correlation

cogmap_acc_correlations:
avg_accuracy:
pearson_r: -0.229
p_value: 0.524
significant: False
n_samples: 10
DirectionEvaluationTask:
pearson_r: -0.107
p_value: 0.769
significant: False
n_samples: 10
View2ActionTextEvaluationTask:
pearson_r: -0.428
p_value: 0.217
significant: False
n_samples: 10
RotDualEvaluationTask:
pearson_r: 0.228
p_value: 0.526
significant: False
n_samples: 10
View2LocationTextEvaluationTask:
pearson_r: -0.207
p_value: 0.567
significant: False
n_samples: 10
RotEvaluationTask:
pearson_r: 0.553
p_value: 0.098
significant: False
n_samples: 10
AlloMappingEvaluationTask:
pearson_r: 0.269
p_value: 0.452
significant: False
n_samples: 10
View2LocationVisionEvaluationTask:
pearson_r: -0.094
p_value: 0.797
significant: False
n_samples: 10
Location2ViewEvaluationTask:
pearson_r: 0.190
p_value: 0.598
significant: False
n_samples: 10
View2ActionVisionEvaluationTask:
pearson_r: -0.497
p_value: 0.144
significant: False
n_samples: 10
PovEvaluationTask:
pearson_r: -0.117
p_value: 0.747
significant: False
n_samples: 10
Action2ViewEvaluationTask:
pearson_r: -0.665
p_value: 0.036
significant: True
n_samples: 10
BackwardPovVisionEvaluationTask:
pearson_r: -0.311
p_value: 0.382
significant: False
n_samples: 10
BackwardPovTextEvaluationTask:
pearson_r: -0.186
p_value: 0.607
significant: False
n_samples: 10
cogmap_infogain_correlation:
pearson_r: 0.157
p_value: 0.665
significant: False
n_samples: 10
n_samples: 10
Performance Charts
Information Gain per Turn
Information Gain per Turn
Cognitive Map (Update)
Cognitive Map Update Turn Averages
Cognitive Map (Full)
Cognitive Map Full Turn Averages
Cognitive Map (Self-Tracking)
Cognitive Map Self-Tracking Turn Averages
Fog Probe F1
Fog Probe F1 per Turn
Fog Probe Precision
Fog Probe Precision per Turn
Fog Probe Recall
Fog Probe Recall per Turn
FB CogMap (Unchanged)
False Belief CogMap Unchanged per Turn
Position Update
Position Update per Turn
Facing Update
Facing Update per Turn
Position Stability
Position Stability per Turn
Facing Stability
Facing Stability per Turn
CogMap vs Accuracy
Cognitive Map vs Accuracy Correlation
CogMap vs InfoGain
Cognitive Map vs Information Gain Correlation

⚙️ vision_passive_think_scout

📊 Samples: 10

🔍 Exploration

avg_node_coverage: None
avg_edge_coverage: None
avg_exploration_steps: None
avg_action_cost: None
avg_action_fail_ratio: None
avg_valid_action_ratio: None
avg_final_information_gain: None
avg_false_belief_steps: None
avg_false_belief_f1: None
avg_false_belief_f1_position: None
avg_false_belief_f1_facing: None
avg_false_belief_action_cost: None
avg_false_belief_action_cost_after_seen: None

✅ Evaluation

avg_accuracy: 0.572
task_metrics:
DirectionEvaluationTask:
accuracy: 0.517
total_count: 30
task_score: 15.500
PovEvaluationTask:
accuracy: 0.350
total_count: 30
task_score: 10.500
BackwardPovTextEvaluationTask:
accuracy: 0.633
total_count: 30
task_score: 19
BackwardPovVisionEvaluationTask:
accuracy: 0.367
total_count: 30
task_score: 11
View2ActionTextEvaluationTask:
accuracy: 0.400
total_count: 30
task_score: 12
View2ActionVisionEvaluationTask:
accuracy: 0.433
total_count: 30
task_score: 13
AlloMappingEvaluationTask:
accuracy: 0.658
total_count: 30
task_score: 19.745
RotEvaluationTask:
accuracy: 0.867
total_count: 30
task_score: 26
Location2ViewEvaluationTask:
accuracy: 0.467
total_count: 30
task_score: 14
View2LocationTextEvaluationTask:
accuracy: 0.592
total_count: 30
task_score: 17.759
View2LocationVisionEvaluationTask:
accuracy: 0.594
total_count: 30
task_score: 17.821
Action2ViewEvaluationTask:
accuracy: 0.667
total_count: 30
task_score: 20

🧠 Cognitive Map

exploration:
correctness:
global_full:
(none)
n_samples: 0

📖 Sample Navigation