From a852defa17ebe96a0997580c781f94eed08dcd8c Mon Sep 17 00:00:00 2001
From: D45Hub
Date: Fri, 16 Aug 2024 12:18:33 +0200
Subject: [PATCH] Added scripts

---
 analysis_helper.py                          | 113 ++++++++++++++++++++
 gen_completion_time_boxplot.py              |  62 +++++++++++
 gen_merged_sensor_data.py                   |  72 +++++++++++++
 gen_merged_sensor_data_and_distance_plot.py | 103 ++++++++++++++++++
 text_merger.py                              |  28 +++++
 5 files changed, 378 insertions(+)
 create mode 100644 analysis_helper.py
 create mode 100644 gen_completion_time_boxplot.py
 create mode 100644 gen_merged_sensor_data.py
 create mode 100644 gen_merged_sensor_data_and_distance_plot.py
 create mode 100644 text_merger.py

diff --git a/analysis_helper.py b/analysis_helper.py
new file mode 100644
index 0000000..8a88e3a
--- /dev/null
+++ b/analysis_helper.py
@@ -0,0 +1,113 @@
+"""Statistical analysis of questionnaire scores per webpage (ANOVA, Friedman, pairwise tests)."""
+
+import pandas as pd
+import statsmodels.api as sm
+from statsmodels.formula.api import ols
+import scipy.stats as stats
+from scipy.stats import f_oneway, friedmanchisquare, mannwhitneyu, ranksums
+import numpy as np
+import pingouin as pg
+
+# SUS: odd items contribute (answer - 1), even items (5 - answer); the sum is scaled by 2.5.
+# The column keeps the name 'TotalIMIScore' so the code below serves both questionnaires.
+df = pd.read_csv('QuestionnaireDataSUS.csv')
+df['TotalIMIScore'] = ((df['Q1'] - 1) + (5 - df['Q2']) + (df['Q3'] - 1) + (5 - df['Q4']) + (df['Q5'] - 1) + (5 - df['Q6']) + (df['Q7'] - 1) + (5 - df['Q8']) + (df['Q9'] - 1) + (5 - df['Q10'])) * 2.5
+
+# IMI
+#df = pd.read_csv('QuestionnaireDataIMI.csv')
+#df['TotalIMIScore'] = (df['Q1'] + df['Q2'] + (8 - df['Q3']) + (8 - df['Q4']) + df['Q5'] + df['Q6'] + df['Q7']) / 7.0
+
+grouped = df.groupby('WebpageID').agg(
+    mean_IMIScore=('TotalIMIScore', 'mean'),
+    std_IMIScore=('TotalIMIScore', 'std'),
+    count=('TotalIMIScore', 'count')
+)
+grouped['variance_IMIScore'] = grouped['std_IMIScore'] ** 2
+
+anova_data = [group['TotalIMIScore'].values for name, group in df.groupby('WebpageID')]
+anova_result = f_oneway(*anova_data)
+print(f"ANOVA Result: F-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}")
+
+friedman_data = df.pivot(index='ParticipantID', columns='WebpageID', values='TotalIMIScore').dropna()
+spher, W, chisq, dof, pval = pg.sphericity(data=df, within='WebpageID', dv='TotalIMIScore', subject='ParticipantID')
+gg = pg.epsilon(data=df, within='WebpageID', dv='TotalIMIScore', subject='ParticipantID', correction='gg')
+print(gg)
+
+friedman_result = friedmanchisquare(*[friedman_data[col] for col in friedman_data])
+print(f"Friedman Test Result: Chi-square statistic = {friedman_result.statistic}, p-value = {friedman_result.pvalue}")
+
+model = ols('TotalIMIScore ~ C(WebpageID)', data=df).fit()
+anova_table = sm.stats.anova_lm(model, typ=2)
+print(anova_table)
+# Shapiro-Wilk normality check for every webpage group.
+for group_scores in anova_data:
+    print(stats.shapiro(group_scores))
+print(stats.levene(*anova_data))
+print(spher, round(W, 5), round(chisq, 3), dof, round(pval, 3))
+
+group_1 = df[df['WebpageID'] == 1]['TotalIMIScore']
+group_2 = df[df['WebpageID'] == 2]['TotalIMIScore']
+group_3 = df[df['WebpageID'] == 3]['TotalIMIScore']
+group_4 = df[df['WebpageID'] == 4]['TotalIMIScore']
+group_5 = df[df['WebpageID'] == 5]['TotalIMIScore']
+group_6 = df[df['WebpageID'] == 6]['TotalIMIScore']
+
+def mann_whitney_test(group_a, group_b):
+    """Run a Mann-Whitney U test and derive Z and the effect size r.
+
+    Z is the normal approximation of U without tie or continuity correction;
+    r is Z / sqrt(n1 + n2).
+
+    Returns (U statistic, p-value, Z, r).
+    """
+    u_statistic, p_value = mannwhitneyu(group_a, group_b)
+    n1 = len(group_a)
+    n2 = len(group_b)
+    mu_u = n1 * n2 / 2
+    sigma_u = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
+    z_value = (u_statistic - mu_u) / sigma_u
+    effect_size_r = z_value / np.sqrt(n1 + n2)
+    return u_statistic, p_value, z_value, effect_size_r
+
+def wilcoxon_rank_sum_test(group_a, group_b):
+    """Run a Wilcoxon rank-sum test and derive the effect size r.
+
+    r is the statistic returned by ``ranksums`` divided by sqrt(n1 + n2).
+
+    Returns (statistic, p-value, r).
+    """
+    rank_sum_statistic, p_value = ranksums(group_a, group_b)
+    effect_size_r = rank_sum_statistic / np.sqrt(len(group_a) + len(group_b))
+    return rank_sum_statistic, p_value, effect_size_r
+
+comparisons = [
+    ("2 vs 5", group_2, group_5),
+    ("2 vs 3", group_2, group_3),
+    ("3 vs 5", group_3, group_5),
+    ("1 vs 6", group_1, group_6),
+    ("4 vs 6", group_4, group_6),
+    ("1 vs 4", group_1, group_4),
+]
+
+results = []
+
+res = pg.rm_anova(data=df, within='WebpageID', dv='TotalIMIScore', subject='ParticipantID', detailed=True)
+print(res)
+
+for label, group_a, group_b in comparisons:
+    mw_u_statistic, mw_p_value, mw_z_value, mw_effect_size_r = mann_whitney_test(group_a, group_b)
+    ws_rank_sum_statistic, ws_p_value, ws_effect_size_r = wilcoxon_rank_sum_test(group_a, group_b)
+
+    results.append({
+        'Comparison': label,
+        'Mann-Whitney U Statistic': mw_u_statistic,
+        'Mann-Whitney p-value': mw_p_value,
+        'Mann-Whitney Z-value': mw_z_value,
+        'Mann-Whitney Effect Size (r)': mw_effect_size_r,
+        'Wilcoxon Rank-Sum Statistic': ws_rank_sum_statistic,
+        'Wilcoxon p-value': ws_p_value,
+        'Wilcoxon Effect Size (r)': ws_effect_size_r
+    })
+
+results_df = pd.DataFrame(results)
+print(results_df)
\ No newline at end of file
diff --git a/gen_completion_time_boxplot.py b/gen_completion_time_boxplot.py
new file mode 100644
index 0000000..f3fce70
--- /dev/null
+++ b/gen_completion_time_boxplot.py
@@ -0,0 +1,62 @@
+"""Draw a boxplot of per-participant task completion times for each study page."""
+
+import os
+import json
+import pandas as pd
+import matplotlib.pyplot as plt
+
+data_dir = './'
+
+all_tap_logs = []
+
+for filename in os.listdir(data_dir):
+    if filename.endswith('.json'):
+        file_path = os.path.join(data_dir, filename)
+        with open(file_path, encoding='utf-8') as file:
+            data = json.load(file)
+            tap_logs = data.get('sensorLog', {}).get('tapLog', [])
+            for entry in tap_logs:
+                entry['participant_id'] = filename
+                all_tap_logs.append(entry)
+
+df = pd.DataFrame(all_tap_logs)
+
+df['timestamp'] = pd.to_datetime(df['timestamp'])
+
+df['path'] = df['url'].apply(lambda x: x.split('://')[-1].split('/', 1)[-1])
+
+# Map URL paths to page labels; taps on tour_operators are merged into Study Page 2.
+path_to_label = {
+    "study-page-1": "Study Page 1",
+    "study-page-2": "Study Page 2",
+    "study-page-3": "Study Page 3",
+    "study-page-4": "Study Page 4",
+    "study-page-5": "Study Page 5",
+    "study-page-6": "Study Page 6",
+    "tour_operators": "Study Page 2"
+}
+
+df['label'] = df['path'].map(path_to_label)
+
+completion_times = df.groupby(['participant_id', 'label'])['timestamp'].agg(['min', 'max']).reset_index()
+completion_times['completion_time'] = (completion_times['max'] - completion_times['min']).dt.total_seconds()
+
+# Drop technical outliers: keep only completion times below 500 seconds.
+comp_query = 'completion_time < 500'
+
+average_completion_times = completion_times.query(comp_query).groupby('label')['completion_time'].mean().reset_index()
+c_times_by_page = completion_times.query(comp_query).groupby('label', group_keys=True)[['completion_time']].apply(lambda x: x)
+times_by_label = c_times_by_page.groupby('label')
+c_times_list = [times_by_label.get_group(f'Study Page {page}')['completion_time'] for page in range(1, 7)]
+
+# Draw plots
+plt.figure(figsize=(10, 6))
+#plt.bar(average_completion_times['label'], average_completion_times['completion_time'], color='skyblue')
+plt.boxplot(c_times_list)
+plt.xlabel('Page')
+plt.xticks([1,2,3,4,5,6], ["1 - BudgetBird", "2 - Hotel", "3 - UVV", "4 - Iceland", "5 - Rental", "6 - QuickDeliver"])
+plt.ylabel('Average Task Completion Time (s)')
+plt.title('Average Task Completion Time by Page')
+plt.xticks(rotation=45, ha='right')
+plt.tight_layout()
+plt.show()
\ No newline at end of file
diff --git a/gen_merged_sensor_data.py b/gen_merged_sensor_data.py
new file mode 100644
index 0000000..08d71be
--- /dev/null
+++ b/gen_merged_sensor_data.py
@@ -0,0 +1,72 @@
+"""Export per-page heatmap JSON files from the tap logs of all participants."""
+
+import os
+import json
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+data_dir = './'
+
+all_tap_logs = []
+
+for filename in os.listdir(data_dir):
+    if filename.endswith('.json'):
+        file_path = os.path.join(data_dir, filename)
+        with open(file_path, encoding='utf-8') as file:
+            data = json.load(file)
+            tap_logs = data.get('sensorLog', {}).get('tapLog', [])
+            for entry in tap_logs:
+                entry['participant_id'] = filename
+                all_tap_logs.append(entry)
+
+df = pd.DataFrame(all_tap_logs)
+
+df['timestamp'] = pd.to_datetime(df['timestamp'])
+df['path'] = df['url'].apply(lambda x: x.split('://')[-1].split('/', 1)[-1])
+
+# Map URL paths to page labels. The tour_operators -> Study Page 2 merge is disabled in this script.
+path_to_label = {
+    "study-page-1": "Study Page 1",
+    "study-page-2": "Study Page 2",
+    "study-page-3": "Study Page 3",
+    "study-page-4": "Study Page 4",
+    "study-page-5": "Study Page 5",
+    "study-page-6": "Study Page 6",
+    #"tour_operators": "Study Page 2"
+}
+
+df['label'] = df['path'].map(path_to_label)
+
+grouped = df.groupby('label')
+
+def generate_heatmap_data(group):
+    """Build the heatmap payload for one page.
+
+    Every tap becomes a record with string x/y, radius 40 and value 5;
+    the records are wrapped as {"min": 1, "max": 9999, "data": [...]}.
+    """
+    heatmap_data = group[['x', 'y']].copy()
+    heatmap_data['radius'] = 40
+    heatmap_data['value'] = 5
+    heatmap_data['x'] = heatmap_data['x'].astype(str)
+    heatmap_data['y'] = heatmap_data['y'].astype(str)
+    heatmap_data_list = heatmap_data.to_dict(orient='records')
+
+    min_value = 1
+    max_value = 9999
+
+    return {
+        "min": min_value,
+        "max": max_value,
+        "data": heatmap_data_list
+    }
+
+for label, group in grouped:
+    # Round only the tap coordinates; other columns such as timestamp and url are not numeric.
+    heatmap_data = generate_heatmap_data(group[['x', 'y']].round())
+    json_filename = f"{label.replace(' ', '_').lower()}.json"
+    with open(json_filename, 'w', encoding='utf-8') as json_file:
+        json.dump(heatmap_data, json_file, indent=4)
+
+    print(f"Generated {json_filename} with {len(heatmap_data['data'])} records.")
\ No newline at end of file
diff --git a/gen_merged_sensor_data_and_distance_plot.py b/gen_merged_sensor_data_and_distance_plot.py
new file mode 100644
index 0000000..4a87f7c
--- /dev/null
+++ b/gen_merged_sensor_data_and_distance_plot.py
@@ -0,0 +1,103 @@
+"""Compute per-participant tap travel distance per page, export heatmap JSON and draw a boxplot."""
+
+import os
+import json
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+data_dir = './'
+
+all_tap_logs = []
+
+for filename in os.listdir(data_dir):
+    if filename.endswith('.json'):
+        file_path = os.path.join(data_dir, filename)
+        with open(file_path, encoding='utf-8') as file:
+            data = json.load(file)
+            tap_logs = data.get('sensorLog', {}).get('tapLog', [])
+            for entry in tap_logs:
+                entry['participant_id'] = filename
+                all_tap_logs.append(entry)
+
+df = pd.DataFrame(all_tap_logs)
+
+df['timestamp'] = pd.to_datetime(df['timestamp'])
+
+df['path'] = df['url'].apply(lambda x: x.split('://')[-1].split('/', 1)[-1])
+
+# Map URL paths to page labels; taps on tour_operators are merged into Study Page 2.
+path_to_label = {
+    "study-page-1": "Study Page 1",
+    "study-page-2": "Study Page 2",
+    "study-page-3": "Study Page 3",
+    "study-page-4": "Study Page 4",
+    "study-page-5": "Study Page 5",
+    "study-page-6": "Study Page 6",
+    "tour_operators": "Study Page 2"
+}
+
+df['label'] = df['path'].map(path_to_label)
+
+def calculate_distances(group):
+    """Return the summed Euclidean distance between consecutive taps in time order."""
+    group = group.sort_values(by='timestamp')
+    x_diff = group['x'].diff().fillna(0)
+    y_diff = group['y'].diff().fillna(0)
+    distances = np.sqrt(x_diff**2 + y_diff**2)
+    total_distance = distances.sum()
+    return total_distance
+
+grouped = df.groupby(['participant_id', 'label'])
+
+distance_data = grouped.apply(calculate_distances).reset_index()
+distance_data.columns = ['participant_id', 'label', 'total_distance']
+
+def generate_heatmap_data(group):
+    """Build the heatmap payload for one page.
+
+    Every tap becomes a record with string x/y, radius 40 and value 5;
+    the records are wrapped as {"min": 1, "max": 999, "data": [...]}.
+    """
+    heatmap_data = group[['x', 'y']].copy()
+    heatmap_data['radius'] = 40
+    heatmap_data['value'] = 5
+    heatmap_data['x'] = heatmap_data['x'].astype(str)
+    heatmap_data['y'] = heatmap_data['y'].astype(str)
+
+    heatmap_data_list = heatmap_data.to_dict(orient='records')
+
+    min_value = 1
+    max_value = 999
+
+    return {
+        "min": min_value,
+        "max": max_value,
+        "data": heatmap_data_list
+    }
+
+for label, group in df.groupby('label'):
+    heatmap_data = generate_heatmap_data(group)
+    json_filename = f"{label.replace(' ', '_').lower()}.json"
+    with open(json_filename, 'w', encoding='utf-8') as json_file:
+        json.dump(heatmap_data, json_file, indent=4)
+
+    print(f"Generated {json_filename} with {len(heatmap_data['data'])} records.")
+
+distance_data.to_csv('distance_data.csv', index=False)
+print("Distance data saved to distance_data.csv")
+# Drop technical outliers: keep only total distances below 15000 pixels.
+comp_query = 'total_distance < 15000'
+
+# Boxplot drawing
+plt.figure(figsize=(12, 6))
+sns.boxplot(x='label', y='total_distance', data=distance_data.query(comp_query))
+plt.xticks(rotation=45)
+plt.xlabel('Study Page')
+plt.xticks([0,1,2,3,4,5], ["1 - BudgetBird", "2 - Hotel", "3 - UVV", "4 - Iceland", "5 - Rental", "6 - QuickDeliver"])
+plt.ylabel('Total Distance Traveled (pixels)')
+plt.title('Total Distance Traveled per Study Page')
+plt.tight_layout()
+plt.savefig('distance_boxplot.png')
+plt.show()
\ No newline at end of file
diff --git a/text_merger.py b/text_merger.py
new file mode 100644
index 0000000..6e7256e
--- /dev/null
+++ b/text_merger.py
@@ -0,0 +1,28 @@
+"""Merge the .txt notes of all participant directories into merged_notes.txt."""
+
+import os
+
+main_directory = './'
+
+participant_dirs = range(1, 34)
+
+output_file = os.path.join(main_directory, 'merged_notes.txt')
+
+with open(output_file, 'w') as outfile:
+    for participant in participant_dirs:
+        participant_dir = os.path.join(main_directory, str(participant))
+
+        if os.path.isdir(participant_dir):
+            # Sort for a reproducible merge order; os.listdir returns entries in arbitrary order.
+            for filename in sorted(os.listdir(participant_dir)):
+                if filename.endswith('.txt'):
+                    file_path = os.path.join(participant_dir, filename)
+
+                    outfile.write(f"\n\n--- Participant {participant} ---\n\n")
+
+                    with open(file_path, 'r') as infile:
+                        outfile.write(infile.read())
+        else:
+            print(f"Directory {participant_dir} does not exist.")
+
+print("Merging complete. Check the merged_notes.txt file in the main directory.")