"""Aggregate participant tap logs into heatmap JSON files and a distance boxplot.

The script reads every ``*.json`` file in ``data_dir``, collects the entries
under ``sensorLog.tapLog``, and then:

* writes one heatmap JSON file per study page label,
* writes ``distance_data.csv`` with the total tap-to-tap distance per
  participant and study page,
* draws and saves ``distance_boxplot.png``.
"""

import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

data_dir = './'
# NOTE(review): never populated in this script; kept in case code outside
# this section still refers to it.
all_data = []
all_tap_logs = []

# NOTE(review): the heatmap JSON files below are written relative to the
# current working directory; when that is also `data_dir` they are picked up
# here on a re-run. They have no 'sensorLog' key, so they contribute no rows.
for filename in os.listdir(data_dir):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(data_dir, filename)
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)
    tap_logs = data.get('sensorLog', {}).get('tapLog', [])
    for entry in tap_logs:
        # The source file name doubles as the participant identifier.
        entry['participant_id'] = filename
        all_tap_logs.append(entry)

if not all_tap_logs:
    # Without this guard the empty DataFrame has no columns and the next
    # statement fails with an uninformative KeyError: 'timestamp'.
    raise SystemExit(
        f"No tap-log entries found in JSON files under {data_dir!r}"
    )

df = pd.DataFrame(all_tap_logs)
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Strip the scheme and host: keep everything after the first '/' that follows
# '://', e.g. 'https://host/study-page-1' -> 'study-page-1'.
df['path'] = df['url'].apply(lambda x: x.split('://')[-1].split('/', 1)[-1])

# Map URL paths to study page labels. 'tour_operators' is merged into
# Study Page 2. Paths that are not listed get a NaN label and are therefore
# left out of every groupby below.
path_to_label = {
    "study-page-1": "Study Page 1",
    "study-page-2": "Study Page 2",
    "study-page-3": "Study Page 3",
    "study-page-4": "Study Page 4",
    "study-page-5": "Study Page 5",
    "study-page-6": "Study Page 6",
    "tour_operators": "Study Page 2",
}
df['label'] = df['path'].map(path_to_label)


def calculate_distances(group: pd.DataFrame) -> float:
    """Return the summed Euclidean distance between consecutive taps.

    Args:
        group: Rows of one participant/page with 'timestamp', 'x' and 'y'
            columns.

    Returns:
        The sum of straight-line distances between each tap and the previous
        one, after ordering the rows by 'timestamp'. A group with a single
        tap yields 0.
    """
    group = group.sort_values(by='timestamp')
    # diff() leaves NaN on the first row; treat it as "no movement".
    x_diff = group['x'].diff().fillna(0)
    y_diff = group['y'].diff().fillna(0)
    distances = np.sqrt(x_diff**2 + y_diff**2)
    return distances.sum()


# Select only the columns the helper reads, so the grouping columns are not
# passed into apply() (deprecated behaviour in recent pandas releases).
grouped = df.groupby(['participant_id', 'label'])[['timestamp', 'x', 'y']]
distance_data = grouped.apply(calculate_distances).reset_index()
distance_data.columns = ['participant_id', 'label', 'total_distance']


def generate_heatmap_data(group: pd.DataFrame) -> dict:
    """Build the heatmap payload for one study page.

    Args:
        group: Tap rows with 'x' and 'y' columns.

    Returns:
        A dict with fixed "min" (1) and "max" (999) bounds and a "data" list
        holding one record per tap: 'x' and 'y' as strings plus a constant
        'radius' of 40 and 'value' of 5.
    """
    heatmap_data = group[['x', 'y']].copy()
    heatmap_data['radius'] = 40
    heatmap_data['value'] = 5
    # NOTE(review): coordinates are exported as strings; presumably the
    # consumer of these files expects that — confirm.
    heatmap_data['x'] = heatmap_data['x'].astype(str)
    heatmap_data['y'] = heatmap_data['y'].astype(str)
    heatmap_data_list = heatmap_data.to_dict(orient='records')
    min_value = 1
    max_value = 999
    return {
        "min": min_value,
        "max": max_value,
        "data": heatmap_data_list,
    }


for label, group in df.groupby('label'):
    heatmap_data = generate_heatmap_data(group)
    json_filename = f"{label.replace(' ', '_').lower()}.json"
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(heatmap_data, json_file, indent=4)
    print(f"Generated {json_filename} with {len(heatmap_data['data'])} records.")

# The CSV holds the unfiltered distances; the outlier filter below only
# affects the plot.
distance_data.to_csv('distance_data.csv', index=False)
print("Distance data saved to distance_data.csv")

# Filter out technical outliers...
comp_query = 'total_distance < 15000'

# Boxplot drawing. The category order is fixed explicitly so that each box
# lines up with its hard-coded tick label even when a page is missing or the
# filter changes which label appears first in the data.
page_order = [
    "Study Page 1",
    "Study Page 2",
    "Study Page 3",
    "Study Page 4",
    "Study Page 5",
    "Study Page 6",
]
page_tick_labels = [
    "1 - BudgetBird",
    "2 - Hotel",
    "3 - UVV",
    "4 - Iceland",
    "5 - Rental",
    "6 - QuickDeliver",
]

plt.figure(figsize=(12, 6))
sns.boxplot(
    x='label',
    y='total_distance',
    data=distance_data.query(comp_query),
    order=page_order,
)
plt.xlabel('Study Page')
# Set positions, labels and rotation in one call so none overrides another.
plt.xticks(list(range(len(page_tick_labels))), page_tick_labels, rotation=45)
plt.ylabel('Total Distance Traveled (pixels)')
plt.title('Total Distance Traveled per Study Page')
plt.tight_layout()
plt.savefig('distance_boxplot.png')
plt.show()