D45Hub
3 months ago
commit a852defa17
5 changed files with 347 additions and 0 deletions
analysis_helper.py (+100)
gen_completion_time_boxplot.py (+60)
gen_merged_sensor_data.py (+66)
gen_merged_sensor_data_and_distance_plot.py (+96)
text_merger.py (+25)
analysis_helper.py
@@ -0,0 +1,100 @@
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from scipy.stats import f_oneway, friedmanchisquare, mannwhitneyu, ranksums
import numpy as np
import pingouin as pg

# SUS: score the System Usability Scale responses. The result is stored in
# 'TotalIMIScore' so the analysis below works unchanged for either questionnaire.
df = pd.read_csv('QuestionnaireDataSUS.csv')
df['TotalIMIScore'] = ((df['Q1'] - 1) + (5 - df['Q2']) + (df['Q3'] - 1) + (5 - df['Q4'])
                       + (df['Q5'] - 1) + (5 - df['Q6']) + (df['Q7'] - 1) + (5 - df['Q8'])
                       + (df['Q9'] - 1) + (5 - df['Q10'])) * 2.5
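# Standard SUS scoring: each odd (positively worded) item contributes
# (response - 1), each even (negatively worded) item contributes
# (5 - response); the sum of the ten contributions is scaled by 2.5,
# yielding the usual 0-100 range.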

# IMI: uncomment to score the Intrinsic Motivation Inventory data instead.
#df = pd.read_csv('QuestionnaireDataIMI.csv')
#df['TotalIMIScore'] = (df['Q1'] + df['Q2'] + (8 - df['Q3']) + (8 - df['Q4']) + df['Q5'] + df['Q6'] + df['Q7']) / 7.0

# Descriptive statistics per webpage.
grouped = df.groupby('WebpageID').agg(
    mean_IMIScore=('TotalIMIScore', 'mean'),
    std_IMIScore=('TotalIMIScore', 'std'),
    count=('TotalIMIScore', 'count')
)
grouped['variance_IMIScore'] = grouped['std_IMIScore'] ** 2

# One-way ANOVA across webpages.
anova_data = [group['TotalIMIScore'].values for name, group in df.groupby('WebpageID')]
anova_result = f_oneway(*anova_data)
print(f"ANOVA Result: F-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}")

# Sphericity check and Greenhouse-Geisser epsilon for the repeated-measures design.
friedman_data = df.pivot(index='ParticipantID', columns='WebpageID', values='TotalIMIScore').dropna()
spher, W, chisq, dof, pval = pg.sphericity(data=df, within='WebpageID', dv='TotalIMIScore', subject='ParticipantID')
gg = pg.epsilon(data=df, within='WebpageID', dv='TotalIMIScore', subject='ParticipantID', correction='gg')
print(gg)

# Friedman test (non-parametric repeated-measures alternative).
friedman_result = friedmanchisquare(*[friedman_data[col] for col in friedman_data])
print(f"Friedman Test Result: Chi-square statistic = {friedman_result.statistic}, p-value = {friedman_result.pvalue}")

# OLS-based ANOVA table plus assumption checks (per-group normality, equal variances).
model = ols('TotalIMIScore ~ C(WebpageID)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)
for group in anova_data:
    print(stats.shapiro(group))
print(stats.levene(*anova_data))
print(spher, round(W, 5), round(chisq, 3), dof, round(pval, 3))

group_1 = df[df['WebpageID'] == 1]['TotalIMIScore']
group_2 = df[df['WebpageID'] == 2]['TotalIMIScore']
group_3 = df[df['WebpageID'] == 3]['TotalIMIScore']
group_4 = df[df['WebpageID'] == 4]['TotalIMIScore']
group_5 = df[df['WebpageID'] == 5]['TotalIMIScore']
group_6 = df[df['WebpageID'] == 6]['TotalIMIScore']

def mann_whitney_test(group_a, group_b):
    u_statistic, p_value = mannwhitneyu(group_a, group_b)
    n1 = len(group_a)
    n2 = len(group_b)
    # Normal approximation of U (no tie correction) to derive Z and r = Z / sqrt(N).
    mu_u = n1 * n2 / 2
    sigma_u = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
    z_value = (u_statistic - mu_u) / sigma_u
    effect_size_r = z_value / np.sqrt(n1 + n2)
    return u_statistic, p_value, z_value, effect_size_r

def wilcoxon_rank_sum_test(group_a, group_b):
    rank_sum_statistic, p_value = ranksums(group_a, group_b)
    effect_size_r = rank_sum_statistic / np.sqrt(len(group_a) + len(group_b))
    return rank_sum_statistic, p_value, effect_size_r
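# Both helpers report r = Z / sqrt(N) (Rosenthal's formula); by Cohen's rough
# guidelines |r| around 0.1 is small, 0.3 medium, 0.5 large. Note that the six
# pairwise p-values below are not adjusted for multiple comparisons.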

comparisons = [
    ("2 vs 5", group_2, group_5),
    ("2 vs 3", group_2, group_3),
    ("3 vs 5", group_3, group_5),
    ("1 vs 6", group_1, group_6),
    ("4 vs 6", group_4, group_6),
    ("1 vs 4", group_1, group_4),
]

results = []

res = pg.rm_anova(data=df, within='WebpageID', dv='TotalIMIScore', subject='ParticipantID', detailed=True)
print(res)

for label, group_a, group_b in comparisons:
    mw_u_statistic, mw_p_value, mw_z_value, mw_effect_size_r = mann_whitney_test(group_a, group_b)
    ws_rank_sum_statistic, ws_p_value, ws_effect_size_r = wilcoxon_rank_sum_test(group_a, group_b)

    results.append({
        'Comparison': label,
        'Mann-Whitney U Statistic': mw_u_statistic,
        'Mann-Whitney p-value': mw_p_value,
        'Mann-Whitney Z-value': mw_z_value,
        'Mann-Whitney Effect Size (r)': mw_effect_size_r,
        'Wilcoxon Rank-Sum Statistic': ws_rank_sum_statistic,
        'Wilcoxon p-value': ws_p_value,
        'Wilcoxon Effect Size (r)': ws_effect_size_r
    })

results_df = pd.DataFrame(results)
print(results_df)
gen_completion_time_boxplot.py
@@ -0,0 +1,60 @@
import os
import json
import pandas as pd
import matplotlib.pyplot as plt

data_dir = './'

all_tap_logs = []

# Collect every tap event from the participants' JSON exports.
for filename in os.listdir(data_dir):
    if filename.endswith('.json'):
        file_path = os.path.join(data_dir, filename)
        with open(file_path, encoding='utf-8') as file:
            data = json.load(file)
        tap_logs = data.get('sensorLog', {}).get('tapLog', [])
        for entry in tap_logs:
            entry['participant_id'] = filename
            all_tap_logs.append(entry)

df = pd.DataFrame(all_tap_logs)

df['timestamp'] = pd.to_datetime(df['timestamp'])

# Strip scheme and host, keeping only the page path.
df['path'] = df['url'].apply(lambda x: x.split('://')[-1].split('/', 1)[-1])

# Mapping; taps logged under tour_operators are merged into study page 2. :)
path_to_label = {
    "study-page-1": "Study Page 1",
    "study-page-2": "Study Page 2",
    "study-page-3": "Study Page 3",
    "study-page-4": "Study Page 4",
    "study-page-5": "Study Page 5",
    "study-page-6": "Study Page 6",
    "tour_operators": "Study Page 2"
}

df['label'] = df['path'].map(path_to_label)

completion_times = df.groupby(['participant_id', 'label'])['timestamp'].agg(['min', 'max']).reset_index()
completion_times['completion_time'] = (completion_times['max'] - completion_times['min']).dt.total_seconds()
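# Completion time is approximated as the span between a participant's first
# and last tap on a page, so time spent before the first tap and after the
# last tap is not counted.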

# Filter out technical outliers
comp_query = 'completion_time < 500'

filtered = completion_times.query(comp_query)
average_completion_times = filtered.groupby('label')['completion_time'].mean().reset_index()
# One series of completion times per study page, in page order, for the boxplot.
c_times_list = [filtered.loc[filtered['label'] == f'Study Page {i}', 'completion_time'] for i in range(1, 7)]

# Draw plots
plt.figure(figsize=(10, 6))
#plt.bar(average_completion_times['label'], average_completion_times['completion_time'], color='skyblue')
plt.boxplot(c_times_list)
plt.xlabel('Page')
plt.xticks([1, 2, 3, 4, 5, 6],
           ["1 - BudgetBird", "2 - Hotel", "3 - UVV", "4 - Iceland", "5 - Rental", "6 - QuickDeliver"],
           rotation=45, ha='right')
plt.ylabel('Task Completion Time (s)')
plt.title('Task Completion Time by Page')
plt.tight_layout()
plt.show()
gen_merged_sensor_data.py
@@ -0,0 +1,66 @@
import os
import json
import pandas as pd

data_dir = './'

all_tap_logs = []

# Collect every tap event from the participants' JSON exports.
for filename in os.listdir(data_dir):
    if filename.endswith('.json'):
        file_path = os.path.join(data_dir, filename)
        with open(file_path, encoding='utf-8') as file:
            data = json.load(file)
        tap_logs = data.get('sensorLog', {}).get('tapLog', [])
        for entry in tap_logs:
            entry['participant_id'] = filename
            all_tap_logs.append(entry)

df = pd.DataFrame(all_tap_logs)

df['timestamp'] = pd.to_datetime(df['timestamp'])
df['path'] = df['url'].apply(lambda x: x.split('://')[-1].split('/', 1)[-1])

# Mapping; the tour_operators merge into study page 2 is disabled here. :)
path_to_label = {
    "study-page-1": "Study Page 1",
    "study-page-2": "Study Page 2",
    "study-page-3": "Study Page 3",
    "study-page-4": "Study Page 4",
    "study-page-5": "Study Page 5",
    "study-page-6": "Study Page 6",
    #"tour_operators": "Study Page 2"
}

df['label'] = df['path'].map(path_to_label)

grouped = df.groupby('label')

# JSON list structure
def generate_heatmap_data(group):
    heatmap_data = group[['x', 'y']].copy()
    heatmap_data['radius'] = 40
    heatmap_data['value'] = 5
    heatmap_data['x'] = heatmap_data['x'].astype(str)
    heatmap_data['y'] = heatmap_data['y'].astype(str)
    heatmap_data_list = heatmap_data.to_dict(orient='records')

    min_value = 1
    max_value = 9999

    return {
        "min": min_value,
        "max": max_value,
        "data": heatmap_data_list
    }
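# The dict above ({"min": ..., "max": ..., "data": [{x, y, value, radius}, ...]})
# follows the point format that heatmap.js-style renderers consume; min/max
# appear to be fixed display bounds rather than values derived from the data.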

for label, group in grouped:
    # Round only the tap coordinates; round() over the whole row set would
    # fail on the non-numeric columns (url, timestamp, ...).
    heatmap_data = generate_heatmap_data(group[['x', 'y']].round())
    json_filename = f"{label.replace(' ', '_').lower()}.json"
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(heatmap_data, json_file, indent=4)

    print(f"Generated {json_filename} with {len(heatmap_data['data'])} records.")
gen_merged_sensor_data_and_distance_plot.py
@@ -0,0 +1,96 @@
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data_dir = './'

all_tap_logs = []

# Collect every tap event from the participants' JSON exports.
for filename in os.listdir(data_dir):
    if filename.endswith('.json'):
        file_path = os.path.join(data_dir, filename)
        with open(file_path, encoding='utf-8') as file:
            data = json.load(file)
        tap_logs = data.get('sensorLog', {}).get('tapLog', [])
        for entry in tap_logs:
            entry['participant_id'] = filename
            all_tap_logs.append(entry)

df = pd.DataFrame(all_tap_logs)

df['timestamp'] = pd.to_datetime(df['timestamp'])

df['path'] = df['url'].apply(lambda x: x.split('://')[-1].split('/', 1)[-1])

# Mapping; taps logged under tour_operators are merged into study page 2. :)
path_to_label = {
    "study-page-1": "Study Page 1",
    "study-page-2": "Study Page 2",
    "study-page-3": "Study Page 3",
    "study-page-4": "Study Page 4",
    "study-page-5": "Study Page 5",
    "study-page-6": "Study Page 6",
    "tour_operators": "Study Page 2"
}

df['label'] = df['path'].map(path_to_label)

def calculate_distances(group):
    group = group.sort_values(by='timestamp')
    x_diff = group['x'].diff().fillna(0)
    y_diff = group['y'].diff().fillna(0)
    distances = np.sqrt(x_diff**2 + y_diff**2)
    total_distance = distances.sum()
    return total_distance
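# Per participant and page this sums the Euclidean distances between
# consecutive taps (sum of sqrt(dx^2 + dy^2) over the tap sequence);
# fillna(0) makes the first tap contribute nothing.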

grouped = df.groupby(['participant_id', 'label'])

distance_data = grouped.apply(calculate_distances).reset_index()
distance_data.columns = ['participant_id', 'label', 'total_distance']

def generate_heatmap_data(group):
    heatmap_data = group[['x', 'y']].copy()
    heatmap_data['radius'] = 40
    heatmap_data['value'] = 5
    heatmap_data['x'] = heatmap_data['x'].astype(str)
    heatmap_data['y'] = heatmap_data['y'].astype(str)

    heatmap_data_list = heatmap_data.to_dict(orient='records')

    min_value = 1
    max_value = 999

    return {
        "min": min_value,
        "max": max_value,
        "data": heatmap_data_list
    }

for label, group in df.groupby('label'):
    heatmap_data = generate_heatmap_data(group)
    json_filename = f"{label.replace(' ', '_').lower()}.json"
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(heatmap_data, json_file, indent=4)

    print(f"Generated {json_filename} with {len(heatmap_data['data'])} records.")

distance_data.to_csv('distance_data.csv', index=False)
print("Distance data saved to distance_data.csv")

# Filter out technical outliers...
comp_query = 'total_distance < 15000'

# Boxplot drawing
plt.figure(figsize=(12, 6))
sns.boxplot(x='label', y='total_distance', data=distance_data.query(comp_query))
plt.xlabel('Study Page')
plt.xticks([0, 1, 2, 3, 4, 5],
           ["1 - BudgetBird", "2 - Hotel", "3 - UVV", "4 - Iceland", "5 - Rental", "6 - QuickDeliver"],
           rotation=45)
plt.ylabel('Total Distance Traveled (pixels)')
plt.title('Total Distance Traveled per Study Page')
plt.tight_layout()
plt.savefig('distance_boxplot.png')
plt.show()
text_merger.py
@@ -0,0 +1,25 @@
import os

main_directory = './'

# Participant folders are named 1 through 33.
participant_dirs = range(1, 34)

output_file = os.path.join(main_directory, 'merged_notes.txt')

with open(output_file, 'w', encoding='utf-8') as outfile:
    for participant in participant_dirs:
        participant_dir = os.path.join(main_directory, str(participant))

        if os.path.exists(participant_dir):
            for filename in os.listdir(participant_dir):
                if filename.endswith('.txt'):
                    file_path = os.path.join(participant_dir, filename)

                    outfile.write(f"\n\n--- Participant {participant} ---\n\n")

                    with open(file_path, 'r', encoding='utf-8') as infile:
                        outfile.write(infile.read())
        else:
            print(f"Directory {participant_dir} does not exist.")

print("Merging complete. Check the merged_notes.txt file in the main directory.")