1 year ago · 3f7ddd7780
--- a/yyyy_js/python-server/data_analysis.py
+++ b/yyyy_js/python-server/data_analysis.py
@@ -0,0 +1,86 @@
 
				+import pandas as pd
			
 
				+import json
			
 
				+from datetime import datetime
			
 
				+import matplotlib.pyplot as plt
			
 
				+
			
 
				+# 设置Matplotlib的默认字体
			
 
				+plt.rcParams['font.sans-serif'] = ['SimHei']  # 'SimHei' 是一种支持中文的字体
			
 
				+plt.rcParams['axes.unicode_minus'] = False  # 正确显示负号
			
 
				+
			
 
				+
			
 
				+# 读取和解析日志文件
			
 
				+def load_log_to_df(filepath):
			
 
				+    data = []
			
 
				+    with open(filepath, 'r') as file:
			
 
				+        for line in file:
			
 
				+            try:
			
 
				+                timestamp = line[:19]
			
 
				+                json_str = line[20:]
			
 
				+                json_data = json.loads(json_str)
			
 
				+                # json_data 的所有key-value
			
 
				+                for key, value in json_data.items():
			
 
				+                    # print(f"key: {key}, value: {value}")
			
 
				+                    # 假设每条记录都包含uid和timestamp，你可能需要根据实际记录结构调整
			
 
				+                    data.append({'uid': key, 'timestamp': timestamp, 'bet_count': value['bet_count'],
			
 
				+                                 'betAmount': value['betAmount']})
			
 
				+            except json.JSONDecodeError:
			
 
				+                continue
			
 
				+    return pd.DataFrame(data)
			
 
				+
			
 
				+
			
 
				+# 按每小时的下注频率和其他用户的差异来区分机器人用户
			
 
				+def parse_by_user_hour_bet():
			
 
				+    # 将时间戳转换为小时（这里假设timestamp是以某种格式的字符串存储）
			
 
				+    df['timestamp'] = pd.to_datetime(df['timestamp'])
			
 
				+    df['hour'] = df['timestamp'].dt.hour
			
 
				+
			
 
				+    # 计算每个用户在每个小时的活动次数
			
 
				+    activity_by_hour = df.groupby(['uid', 'hour']).size().unstack(fill_value=0)
			
 
				+    # 计算标准差来评估活动的均匀程度
			
 
				+    activity_std = activity_by_hour.std(axis=1)
			
 
				+    # 识别疑似机器人用户
			
 
				+    threshold = activity_std.quantile(0.95)
			
 
				+    suspected_bots = activity_std[activity_std > threshold].index
			
 
				+    robot_user = list(suspected_bots)
			
 
				+    print(f"疑似机器人数量{len(robot_user)}\n用户ID列表: {robot_user}")
			
 
				+    # 可视化
			
 
				+    activity_std.hist(bins=30)
			
 
				+    plt.title('用户活动时间标准差分布')
			
 
				+    plt.xlabel('标准差')
			
 
				+    plt.ylabel('用户数量')
			
 
				+    plt.show()
			
 
				+
			
 
				+
			
 
				+#     通过用户活跃时间段来判断
			
 
				+def parse_by_user_activity_time_wide():
			
 
				+    # 将时间戳转换为小时（这里假设timestamp是以某种格式的字符串存储）
			
 
				+    df['timestamp'] = pd.to_datetime(df['timestamp'])
			
 
				+    df['hour'] = df['timestamp'].dt.hour
			
 
				+    # 确定有活动记录的小时
			
 
				+    active_hours = df['hour'].unique()
			
 
				+    # 计算每个用户在哪些小时有活动
			
 
				+    user_hours = df.groupby('uid')['hour'].apply(set)
			
 
				+    # 计算活动时间覆盖率
			
 
				+    user_coverage = user_hours.apply(lambda x: len(x) / len(active_hours))
			
 
				+    # 识别疑似机器人：以活动时间覆盖率的95%分位数为阈值
			
 
				+    threshold = user_coverage.quantile(0.95)
			
 
				+    suspected_bots = user_coverage[user_coverage > threshold].index
			
 
				+    # 打印疑似机器人数量和ID列表
			
 
				+    print(f"疑似机器人数量: {len(suspected_bots)}")
			
 
				+    print(f"用户ID列表: {list(suspected_bots)}")
			
 
				+    # 可视化活动时间覆盖率分布
			
 
				+    plt.hist(user_coverage, bins=30, alpha=0.7)
			
 
				+    plt.axvline(x=threshold, color='r', linestyle='--', label='95%分位数阈值')
			
 
				+    plt.title('用户活动时间覆盖率分布')
			
 
				+    plt.xlabel('活动时间覆盖率')
			
 
				+    plt.ylabel('用户数量')
			
 
				+    plt.legend()
			
 
				+    plt.show()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # 将日志文件转换为DataFrame
			
 
				+    df = load_log_to_df('user_bet.log')
			
 
				+    # parse_by_user_hour_bet()
			
 
				+
			
 
				+    parse_by_user_activity_time_wide()