-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmovie_web_scraper.py
More file actions
80 lines (56 loc) · 2.34 KB
/
movie_web_scraper.py
File metadata and controls
80 lines (56 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import sys
import pickle
import requests
from bs4 import BeautifulSoup
def movie_scraping_algorithm(link, timeout=10):
    """Scrape movie names and attribute cells from the movie table on a page.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Defaults to 10.

    Returns:
        A ``(names, attributes)`` tuple of two flat lists of strings: the
        text of every name cell and the text of every attribute cell, each
        in row order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
        ValueError: If the page has no table with the expected CSS class.
    """
    # Without a timeout a stalled server would block the script indefinitely.
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # NOTE(review): these class names look auto-generated by the site's CSS
    # tooling and may change when the site is redeployed - confirm they are
    # still current if the lookup below starts failing.
    table = soup.find("table", attrs={"class": "css-1179hly"})
    if table is None:
        # soup.find returns None when nothing matches; report that clearly
        # instead of crashing with an AttributeError on the next line.
        raise ValueError(
            "No table with class 'css-1179hly' found at {}".format(link)
        )

    names = []
    attributes = []
    for row in table.find_all("tr"):
        # Collect the text of the cells carrying the name class.
        names.extend(
            cell.text
            for cell in row.find_all(
                "td", attrs={"class": "css-1u7zfla e126mwsw1"}
            )
        )
        # Collect the text of the cells carrying the attribute class.
        attributes.extend(
            cell.text
            for cell in row.find_all("td", attrs={"class": "css-1u11l3y"})
        )
    return names, attributes
# Group 4 values in movie attribute list
# Grouped Values -- [Year, View discretion, IMDB rating, Reelgood rating]
def groupMovieAttr(attr, group_size=4):
    """Split a flat sequence into consecutive chunks of ``group_size`` items.

    Args:
        attr: Sliceable sequence (e.g. a list) of attribute values.
        group_size: Number of items per chunk. Defaults to 4, the number of
            attribute values scraped per movie.

    Returns:
        A list of slices of ``attr``, each holding ``group_size`` items; the
        last one is shorter when ``len(attr)`` is not a multiple of
        ``group_size``. An empty input yields an empty list.

    Raises:
        ValueError: If ``group_size`` is less than 1.
    """
    if group_size < 1:
        # A zero step makes range() fail and a negative one silently yields
        # nothing, so reject both up front with a clear message.
        raise ValueError("group_size must be a positive integer")
    return [
        attr[start:start + group_size]
        for start in range(0, len(attr), group_size)
    ]
# Creating a dictionary of [key, value] = [movie names, list of movie attributes]
def create_dict(key, value):
    """Build a dictionary pairing each key with the value at the same position.

    Args:
        key: Iterable of hashable keys (the movie names).
        value: Iterable of values (the per-movie attribute lists).

    Returns:
        A dict mapping ``key[i]`` to ``value[i]``. Pairing stops at the
        shorter of the two inputs, and a repeated key keeps its last value.
    """
    return dict(zip(key, value))
# Pickle the movie dictionary
def pickle_data(file, path='movies.pickle'):
    """Serialize an object to disk with pickle.

    Args:
        file: The object to serialize. Despite the name this is the data
            itself (e.g. the movie dictionary), not a file handle.
        path: Destination file path; an existing file is overwritten.
            Defaults to ``'movies.pickle'``.
    """
    with open(path, 'wb') as handle:
        pickle.dump(file, handle, protocol=pickle.HIGHEST_PROTOCOL)
def main(argv):
    """Scrape the movie table, pickle it, then reload and print the result.

    Args:
        argv: Command-line argument list (normally ``sys.argv``). Only the
            first entry, the script name, is inspected.
    """
    # Use the argument list that was passed in rather than reaching for the
    # global sys.argv, and exit non-zero so callers can detect the failure.
    if not argv or not argv[0]:
        print("Error 404: File Not Found!")
        sys.exit(1)

    # URL - database source
    url = 'https://reelgood.com/movies'

    # Scrape the movie table; returns two flat lists (names, attribute cells).
    movie_names, movie_ratings = movie_scraping_algorithm(url)
    # Group the flat attribute list into one sub-list per movie.
    movie_ratings = groupMovieAttr(movie_ratings)
    # Map each movie name to its attribute group.
    movie_dict = create_dict(movie_names, movie_ratings)
    # Write the dictionary to 'movies.pickle'.
    pickle_data(movie_dict)

    # Load the pickled movie dictionary back from disk. The file was written
    # by this script just above, so unpickling it is safe here; never
    # unpickle files from untrusted sources.
    with open('movies.pickle', 'rb') as handle:
        loaded_movies = pickle.load(handle)

    # Print what was actually read back (not the in-memory original) so the
    # output really confirms the pickle round trip worked.
    print(loaded_movies)
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main(sys.argv)