Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 7a7d7b5

Browse files
authored
Add files via upload
1 parent 75bb0b5 commit 7a7d7b5

File tree

3 files changed

+110713
-0
lines changed

3 files changed

+110713
-0
lines changed

电影推荐/1.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# coding=utf-8
2+
# Copyright (c) 2020 ichinae.com, Inc. All Rights Reserved
3+
"""
4+
Module Summary Here.
5+
Authors: lijinjun1351@ichinae.com
6+
"""
7+
# -*- coding:utf-8 -*-
8+
"""
9+
author:david
10+
date:2018***
11+
util function
12+
"""
13+
import os
14+
15+
16+
def get_item_info(input_file):
    """Load item (movie) metadata from a comma-separated file.

    The first line of the file is treated as a header and skipped. Each
    remaining line is split on commas; the first field is the item id, the
    last field is the genre and everything in between is the title.

    :param input_file: path to the item description file (read as UTF-8)
    :return: a dict mapping itemid -> [title, genre]; empty if the path does
        not exist
    """
    if not os.path.exists(input_file):
        return {}
    item_info = {}
    # The context manager guarantees the handle is released even if a line
    # fails to decode or another error interrupts the loop.
    with open(input_file, encoding='utf-8') as fp:
        next(fp, None)  # skip the header line (no-op on an empty file)
        for line in fp:
            fields = line.strip().split(',')
            if len(fields) < 3:
                # Rows normally have three columns; shorter rows are dropped.
                continue
            # Some titles contain commas and were split into several pieces,
            # so rejoin everything between the id and the genre. For a plain
            # three-column row this is simply the middle field.
            itemid, genre = fields[0], fields[-1]
            title = ','.join(fields[1:-1])
            item_info[itemid] = [title, genre]
    return item_info
43+
44+
45+
def get_ave_score(input_file):
    """Compute the average rating of every item in a user rating file.

    The first line of the file is treated as a header and skipped. Rows with
    fewer than four comma-separated fields are ignored; in the remaining rows
    the second field is the item id and the third is the rating.

    :param input_file: path to the user rating file (read as UTF-8)
    :return: a dict mapping itemid -> average score rounded to 3 decimals;
        empty if the path does not exist
    :raises ValueError: if a rating field cannot be parsed as a float
    """
    if not os.path.exists(input_file):
        return {}
    # itemid -> [number of ratings, sum of ratings]
    record_dict = {}
    # Explicit encoding keeps this reader consistent with get_item_info and
    # independent of the platform default; `with` closes the file on errors.
    with open(input_file, encoding='utf-8') as fp:
        next(fp, None)  # skip the header line (no-op on an empty file)
        for line in fp:
            fields = line.strip().split(',')
            if len(fields) < 4:
                continue
            itemid, rating = fields[1], float(fields[2])
            stats = record_dict.setdefault(itemid, [0, 0.0])
            stats[0] += 1
            stats[1] += rating
    return {
        itemid: round(total / count, 3)
        for itemid, (count, total) in record_dict.items()
    }
73+
74+
75+
def get_train_data(input_file):
    """Build a balanced list of training samples from a user rating file.

    For every user, ratings at or above the threshold (4) become positive
    samples with label 1 and all other ratings become negative candidates.
    Each user contributes the same number of positives and negatives: the
    first ``n`` positives in file order and the ``n`` negative candidates
    with the highest average item score, where ``n`` is the smaller of the
    two counts. Users with no positives or no negatives contribute nothing.

    :param input_file: path to the user rating file (read as UTF-8)
    :return: a list of (userid, itemid, label) tuples; empty if the path
        does not exist
    :raises ValueError: if a rating field cannot be parsed as a float
    """
    if not os.path.exists(input_file):
        return []
    score_dict = get_ave_score(input_file)
    score_thr = 4
    pos_dict = {}  # userid -> [(itemid, 1), ...]
    neg_dict = {}  # userid -> [(itemid, average score of item), ...]
    # `with` releases the handle even when a malformed rating raises.
    with open(input_file, encoding='utf-8') as fp:
        next(fp, None)  # skip the header line (no-op on an empty file)
        for line in fp:
            fields = line.strip().split(',')
            if len(fields) < 4:
                continue
            userid, itemid, rating = fields[0], fields[1], float(fields[2])
            # Register the user in both maps so they can be indexed later.
            pos_items = pos_dict.setdefault(userid, [])
            neg_items = neg_dict.setdefault(userid, [])
            if rating >= score_thr:
                pos_items.append((itemid, 1))
            else:
                neg_items.append((itemid, score_dict.get(itemid, 0)))

    train_data = []
    for userid, pos_items in pos_dict.items():
        neg_items = neg_dict[userid]
        # Negative sampling must keep positives and negatives balanced.
        data_num = min(len(pos_items), len(neg_items))
        if data_num == 0:
            continue
        train_data += [
            (userid, itemid, label) for itemid, label in pos_items[:data_num]
        ]
        # Prefer negatives whose items have the highest average score.
        top_neg = sorted(
            neg_items, key=lambda element: element[1], reverse=True
        )[:data_num]
        train_data += [(userid, itemid, 0) for itemid, _ in top_neg]
    return train_data
119+
120+
121+
if __name__ == '__main__':
    # Smoke test against the local data files. The loaders return empty
    # containers when a file is missing, so use .get() to print None rather
    # than crash with KeyError in that case.
    item_dict = get_item_info("movie.txt")
    print(len(item_dict))
    print(item_dict.get("1"))
    print(item_dict.get('11'))

    score_dict = get_ave_score("rating.txt")
    print(len(score_dict))
    print(score_dict.get('1'))

    train_data = get_train_data("rating.txt")
    print(len(train_data))
133+

0 commit comments

Comments
 (0)