"""Box-plot per-page task completion times computed from participant tap logs.

Every ``*.json`` file in ``DATA_DIR`` is read, the entries found under
``sensorLog.tapLog`` are collected, and for each (file, study page) pair the
time between the first and last logged tap is used as the completion time.
"""

import json
import os

import pandas as pd

DATA_DIR = './'

# Mapping with tour_operators being mergeable to study page 2.
PATH_TO_LABEL = {
    "study-page-1": "Study Page 1",
    "study-page-2": "Study Page 2",
    "study-page-3": "Study Page 3",
    "study-page-4": "Study Page 4",
    "study-page-5": "Study Page 5",
    "study-page-6": "Study Page 6",
    "tour_operators": "Study Page 2",
}

# Order of the boxes on the x axis; TICK_LABELS must stay aligned with it.
PAGE_LABELS = (
    "Study Page 1",
    "Study Page 2",
    "Study Page 3",
    "Study Page 4",
    "Study Page 5",
    "Study Page 6",
)
TICK_LABELS = (
    "1 - BudgetBird",
    "2 - Hotel",
    "3 - UVV",
    "4 - Iceland",
    "5 - Rental",
    "6 - QuickDeliver",
)

# Completion times at or above this many seconds are treated as technical
# outliers and excluded from the plot.
MAX_COMPLETION_SECONDS = 500


def load_tap_logs(data_dir):
    """Return all tap-log entries from the ``*.json`` files in *data_dir*.

    Each entry is the dict found in the file's ``sensorLog.tapLog`` list,
    extended with a ``participant_id`` key holding the file name. Files
    without a ``sensorLog`` or ``tapLog`` value contribute no entries.
    """
    entries = []
    # Sorted only to make the result order independent of the file system.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(data_dir, filename), encoding='utf-8') as file:
            data = json.load(file)
        # ``or`` also covers keys that are present but null in the JSON.
        tap_logs = (data.get('sensorLog') or {}).get('tapLog') or []
        for entry in tap_logs:
            entry['participant_id'] = filename
            entries.append(entry)
    return entries


def url_to_path(url):
    """Return the part of *url* after the scheme and the first ``/``.

    Returns ``None`` when *url* is not a string (e.g. the entry had no URL),
    so such entries simply end up without a page label.
    """
    if not isinstance(url, str):
        return None
    return url.split('://')[-1].split('/', 1)[-1]


def compute_completion_times(tap_logs):
    """Compute the completion time per participant and study page.

    Args:
        tap_logs: list of dicts with ``timestamp``, ``url`` and
            ``participant_id`` keys, as returned by :func:`load_tap_logs`.

    Returns:
        DataFrame with columns ``participant_id``, ``label``, ``min``,
        ``max`` and ``completion_time`` (seconds between the first and last
        timestamp of the group). Entries whose URL path is not in
        ``PATH_TO_LABEL`` are dropped by the group-by.
    """
    if not tap_logs:
        # Without any entries the DataFrame would have no columns at all;
        # return an empty frame of the usual shape instead of a KeyError.
        return pd.DataFrame({
            'participant_id': pd.Series(dtype=object),
            'label': pd.Series(dtype=object),
            'min': pd.Series(dtype='datetime64[ns]'),
            'max': pd.Series(dtype='datetime64[ns]'),
            'completion_time': pd.Series(dtype=float),
        })

    df = pd.DataFrame(tap_logs)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['path'] = df['url'].map(url_to_path)
    df['label'] = df['path'].map(PATH_TO_LABEL)

    completion_times = (
        df.groupby(['participant_id', 'label'])['timestamp']
        .agg(['min', 'max'])
        .reset_index()
    )
    completion_times['completion_time'] = (
        completion_times['max'] - completion_times['min']
    ).dt.total_seconds()
    return completion_times


def completion_times_by_page(completion_times, labels=PAGE_LABELS,
                             max_seconds=MAX_COMPLETION_SECONDS):
    """Return one list of completion times per page, in *labels* order.

    Rows with ``completion_time`` of *max_seconds* or more are filtered out
    as technical outliers. A page without any remaining rows yields an empty
    list, which keeps the result aligned with *labels*.
    """
    valid = completion_times[completion_times['completion_time'] < max_seconds]
    times_by_label = {
        label: times.tolist()
        for label, times in valid.groupby('label')['completion_time']
    }
    return [times_by_label.get(label, []) for label in labels]


def plot_completion_times(times_by_page, tick_labels=TICK_LABELS):
    """Show a box plot with one box per entry of *times_by_page*."""
    # Imported here so the data helpers above can be imported without
    # matplotlib being available.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(10, 6))
    plt.boxplot(times_by_page)
    plt.xlabel('Page')
    plt.xticks(
        range(1, len(tick_labels) + 1),
        list(tick_labels),
        rotation=45,
        ha='right',
    )
    # NOTE(review): the wording says "Average" although a box plot shows the
    # distribution of completion times - confirm before changing the text.
    plt.ylabel('Average Task Completion Time (s)')
    plt.title('Average Task Completion Time by Page')
    plt.tight_layout()
    plt.show()


def main():
    """Load the tap logs from ``DATA_DIR`` and draw the box plot."""
    tap_logs = load_tap_logs(DATA_DIR)
    completion_times = compute_completion_times(tap_logs)
    plot_completion_times(completion_times_by_page(completion_times))


if __name__ == '__main__':
    main()