-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrecommender.py
More file actions
118 lines (93 loc) · 3.3 KB
/
recommender.py
File metadata and controls
118 lines (93 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pandas as pd
import re
import difflib
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import os
import scipy.sparse as sp
import pickle
# Paths for the processed dataset and the cached model artefacts.
DATA_PATH = "./data/Processed_Movies.csv"
VECTORS_PATH = "./models/vectors.npz"
EMBEDDINGS_PATH = "./models/sbert_embeddings.npy"
MODEL_PATH = "./models/my_model"
STOPWORDS_PATH = "./models/stopwords.pkl"

# English stopword set: use the pickled cache when present, otherwise
# download via NLTK once and cache the result for later runs.
if not os.path.exists(STOPWORDS_PATH):
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    with open(STOPWORDS_PATH, 'wb') as fh:
        pickle.dump(stop_words, fh)
else:
    with open(STOPWORDS_PATH, 'rb') as fh:
        stop_words = pickle.load(fh)

# Shared stemmer instance used by preprocessing().
STEMMER = PorterStemmer()
def preprocessing(text):
    """Normalise raw tag text for the bag-of-words vectorizer.

    Lower-cases the input, replaces every non-alphabetic character with a
    space, drops English stopwords, and Porter-stems each remaining word.

    Bug fix: the original signature was ``preprocessing(self, text)`` even
    though this is a module-level function; ``DATA['tags'].apply(preprocessing)``
    passes a single argument, so every call raised TypeError. The stray
    ``self`` parameter is removed.

    Args:
        text: Raw tag string for one movie.

    Returns:
        A single space-joined string of stemmed, stopword-free tokens.
    """
    text = text.lower()
    # Keep letters only; digits/punctuation become spaces so words stay split.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Filter stopwords on the raw token, then stem (original order preserved).
    words = [STEMMER.stem(word) for word in text.split() if word not in stop_words]
    return " ".join(words)
# Load data ONCE at import time.
DATA = pd.read_csv(DATA_PATH)
TITLES = list(DATA["Titles"])
POSTERS = list(DATA["poster_path"])

# Load (or build) the bag-of-words vectors.
# Bug fix: on a cold start the fitted matrix was stored in a local
# `vectors` and the module-level `VECTORS` was never assigned, so
# recommend() crashed later with NameError. Assign VECTORS directly.
if os.path.exists(VECTORS_PATH):
    VECTORS = sp.load_npz(VECTORS_PATH)
else:
    tags1 = DATA['tags'].apply(preprocessing)
    cv = CountVectorizer(max_features=5000, stop_words='english')
    VECTORS = cv.fit_transform(tags1)
    sp.save_npz(VECTORS_PATH, VECTORS)

# Load the SentenceTransformer ONCE, caching it locally after first download.
if os.path.exists(MODEL_PATH):
    MODEL = SentenceTransformer(MODEL_PATH)
else:
    MODEL = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    MODEL.save(MODEL_PATH)

# Load (or compute and cache) the SBERT embeddings of every movie's tags.
if os.path.exists(EMBEDDINGS_PATH):
    EMBEDDINGS = np.load(EMBEDDINGS_PATH)
else:
    tags2 = DATA['tags']
    EMBEDDINGS = MODEL.encode(list(tags2))
    np.save(EMBEDDINGS_PATH, EMBEDDINGS)

# Precompute lower-cased title -> row index (duplicate titles: last wins).
TITLE_MAP = {str(title).lower(): idx for idx, title in enumerate(TITLES)}
# ------------------ UTILS ------------------
def poster_url(p):
    """Build a full TMDB w500 poster URL from a poster path.

    Non-string inputs (e.g. NaN from pandas for a missing poster) yield
    an empty string.
    """
    if isinstance(p, str):
        return "https://image.tmdb.org/t/p/w500" + p
    return ""
# ------------------ MAIN RECOMMENDER ------------------
def recommend(movie_name, top_k=10):
    """Return up to `top_k` hybrid movie recommendations for a query title.

    Fuzzy-matches the query against the catalogue, then scores every movie
    with 0.6 * bag-of-words cosine similarity + 0.4 * SBERT semantic
    similarity and returns the top results.

    Args:
        movie_name: Free-text movie title typed by the user.
        top_k: Maximum number of recommendations to return.

    Returns:
        A list of {"title", "poster", "score"} dicts, or None when no
        title is close enough to the query.

    Fixes vs. original:
      * Uses the precomputed TITLE_MAP for the index lookup instead of
        re-lowercasing the whole Titles column on every call.
      * Removes the redundant `.lower()` on the already-lowercased match
        and the dead `len(recommendations) == top_k` break (topk already
        yields exactly k items).
      * Clamps k so torch.topk cannot fail when top_k exceeds the
        catalogue size.
    """
    movie_name = movie_name.lower().strip()
    closest = difflib.get_close_matches(movie_name, TITLE_MAP.keys(), n=1)
    if not closest:
        return None
    index = TITLE_MAP[closest[0]]

    # BoW similarity of the matched movie's vector against all movies.
    bow_scores = torch.from_numpy(
        cosine_similarity(VECTORS[index], VECTORS).flatten()
    )

    # Semantic similarity. NOTE(review): the raw query *title* is encoded,
    # not the matched movie's tags — kept as-is; confirm this is intended.
    query_embedding = MODEL.encode(movie_name)
    sem_scores = MODEL.similarity(query_embedding, EMBEDDINGS)[0]

    # Hybrid score, then take the top-k (clamped to the catalogue size).
    final_scores = 0.6 * bow_scores + 0.4 * sem_scores
    k = min(top_k, final_scores.shape[0])
    scores, indices = torch.topk(final_scores, k=k)

    return [
        {
            "title": TITLES[idx],
            "poster": poster_url(POSTERS[idx]),
            "score": round(score.item(), 4),
        }
        for score, idx in zip(scores, indices)
    ]