|
@@ -0,0 +1,86 @@
|
|
|
+import pandas as pd
|
|
|
+import json
|
|
|
+from datetime import datetime
|
|
|
+import matplotlib.pyplot as plt
|
|
|
+
|
|
|
+
|
|
|
+plt.rcParams['font.sans-serif'] = ['SimHei']
|
|
|
+plt.rcParams['axes.unicode_minus'] = False
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def load_log_to_df(filepath, encoding='utf-8'):
    """Parse a bet log file into a flat DataFrame, one row per user entry.

    Every usable line is expected to consist of a 19-character timestamp,
    one separator character, and a JSON object that maps a user id to a
    record holding ``bet_count`` and ``betAmount``.

    Lines that cannot be used are skipped instead of aborting the load:
    lines whose payload is not valid JSON, lines whose payload is not a JSON
    object, and individual user entries that lack one of the two fields.

    Args:
        filepath: Path of the log file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so
            the result does not depend on the platform locale.

    Returns:
        A ``pandas.DataFrame`` with the columns ``uid``, ``timestamp``,
        ``bet_count`` and ``betAmount``. ``timestamp`` is the raw 19-character
        prefix of the line (not parsed here). The columns are present even
        when no line could be parsed.
    """
    columns = ['uid', 'timestamp', 'bet_count', 'betAmount']
    rows = []
    with open(filepath, 'r', encoding=encoding) as file:
        for line in file:
            timestamp = line[:19]
            # Character 19 is the separator; the JSON payload starts at 20.
            try:
                payload = json.loads(line[20:])
            except json.JSONDecodeError:
                continue
            if not isinstance(payload, dict):
                # Valid JSON, but not the uid -> record mapping we need.
                continue
            for uid, record in payload.items():
                try:
                    rows.append({
                        'uid': uid,
                        'timestamp': timestamp,
                        'bet_count': record['bet_count'],
                        'betAmount': record['betAmount'],
                    })
                except (KeyError, TypeError):
                    # Record is missing a field or is not a mapping at all;
                    # drop just this entry and keep the rest of the line.
                    continue
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def parse_by_user_hour_bet(data=None):
    """Flag users whose per-hour record counts are unusually uneven.

    For every user the number of log rows in each hour of the day is
    counted, and the standard deviation of those hourly counts is computed.
    Users whose standard deviation is strictly above the 95th percentile of
    all users are printed as suspected bots, and a histogram of the
    standard deviations is displayed.

    Args:
        data: DataFrame with at least the columns ``uid`` and ``timestamp``
            (anything ``pandas.to_datetime`` accepts). When omitted, the
            module-level ``df`` is used, so the original zero-argument call
            keeps working.

    Returns:
        None. The result is printed and plotted.
    """
    if data is None:
        # Backward-compatible fallback to the frame loaded by the script.
        data = df

    # ``assign`` returns a new frame, so the caller's data is not modified.
    frame = data.assign(hour=pd.to_datetime(data['timestamp']).dt.hour)

    # Rows: users; columns: hours seen in the data; cells: row counts.
    activity_by_hour = frame.groupby(['uid', 'hour']).size().unstack(fill_value=0)
    activity_std = activity_by_hour.std(axis=1)

    threshold = activity_std.quantile(0.95)
    robot_user = list(activity_std[activity_std > threshold].index)
    print(f"疑似机器人数量{len(robot_user)}\n用户ID列表: {robot_user}")

    activity_std.hist(bins=30)
    plt.title('用户活动时间标准差分布')
    plt.xlabel('标准差')
    plt.ylabel('用户数量')
    plt.show()
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def parse_by_user_activity_time_wide(data=None):
    """Flag users who are active in an unusually large share of hours.

    A user's coverage is the number of distinct hours of the day in which
    the user has at least one row, divided by the number of distinct hours
    present in the whole data set. Users whose coverage is strictly above
    the 95th percentile are printed as suspected bots, and a histogram of
    the coverage values with the threshold marked is displayed.

    Args:
        data: DataFrame with at least the columns ``uid`` and ``timestamp``
            (anything ``pandas.to_datetime`` accepts). When omitted, the
            module-level ``df`` is used, so the original zero-argument call
            keeps working.

    Returns:
        None. The result is printed and plotted.
    """
    if data is None:
        # Backward-compatible fallback to the frame loaded by the script.
        data = df

    # ``assign`` returns a new frame, so the caller's data is not modified.
    frame = data.assign(hour=pd.to_datetime(data['timestamp']).dt.hour)

    # Distinct hours per user over distinct hours overall.
    total_hours = frame['hour'].nunique()
    user_coverage = frame.groupby('uid')['hour'].nunique() / total_hours

    threshold = user_coverage.quantile(0.95)
    suspected_bots = user_coverage[user_coverage > threshold].index

    print(f"疑似机器人数量: {len(suspected_bots)}")
    print(f"用户ID列表: {list(suspected_bots)}")

    plt.hist(user_coverage, bins=30, alpha=0.7)
    plt.axvline(x=threshold, color='r', linestyle='--', label='95%分位数阈值')
    plt.title('用户活动时间覆盖率分布')
    plt.xlabel('活动时间覆盖率')
    plt.ylabel('用户数量')
    plt.legend()
    plt.show()
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Log file is looked up relative to the current working directory.
    log_path = 'user_bet.log'

    # Must be bound at module level under the name ``df``: the analysis
    # function below is called without arguments and reads this global.
    df = load_log_to_df(log_path)

    parse_by_user_activity_time_wide()
|