barcode.py
# -*- coding: utf-8 -*-
# Scrape product details from barcodespider.com for a search keyword and
# write the results to a CSV file (Python 3).
import csv
import datetime
import sys

import requests
from bs4 import BeautifulSoup
session = requests.Session()
session.headers.update({
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
})

# Pool of HTTP proxies (host:port) rotated across detail-page requests.
# These are public endpoints and may no longer be reachable.
proxies = [
    "163.172.48.109:15002",
    "163.172.48.117:15002",
    "163.172.48.119:15002",
    "163.172.48.121:15002",
    "163.172.36.181:15002",
    "163.172.36.191:15002",
    "62.210.251.228:15002",
    "163.172.36.207:15002",
]
def get_detail(detail_url, count):
    """Fetch a product detail page, rotating through the proxy pool.

    A response without any <table> elements is treated as a blocked or
    failed request and retried through the next proxy.
    """
    # Cap the retries so a permanently blocked URL cannot recurse forever.
    if count >= len(proxies) * 3:
        raise RuntimeError('could not fetch {} through any proxy'.format(detail_url))
    # count % len(proxies) cycles through the pool without overrunning it;
    # requests expects an explicit scheme on the proxy URL.
    r = session.get(detail_url, proxies={
        "https": "http://" + proxies[count % len(proxies)],
    })
    soup = BeautifulSoup(r.text, features="html.parser")
    if len(soup.find_all('table')) == 0:
        return get_detail(detail_url, count + 1)
    return soup
def scrape_data(keyword):
    csv.register_dialect('myDialect1',
                         quoting=csv.QUOTE_ALL,
                         skipinitialspace=True)
    date = datetime.datetime.now()
    # One CSV per run, named <keyword>_<month>_<day>_<year>.csv.
    f = open('{}_{}_{}_{}.csv'.format(keyword, date.month, date.day, date.year),
             'w', newline='', encoding='utf-8')
    writer = csv.writer(f, dialect='myDialect1')
    writer.writerow(['Name', 'Image', 'UPC-A', 'EAN-13', 'Amazon ASIN', 'Category',
                     'Brand', 'Model', 'Last Scanned', 'Store Name', 'Price', 'Currency'])

    # Load the first results page to count the pagination links
    # (<li class="page-item">); search pages go through a single fixed proxy.
    url = 'https://www.barcodespider.com/{}'
    r = session.get(url.format(keyword), proxies={
        "https": "http://83.149.70.159:13042",
    })
    soup = BeautifulSoup(r.text, features="html.parser")
    pagecount = len(soup.find_all('li', class_='page-item'))
    if pagecount == 0:
        pagecount = 1
    baseurl = 'https://www.barcodespider.com/search/{}/{}'
    print(pagecount)

    for page in range(1, pagecount + 1):
        print(baseurl.format(keyword, page))
        r = session.get(baseurl.format(keyword, page), proxies={
            "https": "http://83.149.70.159:13042",
        })
        soup = BeautifulSoup(r.text, features="html.parser")
        items = soup.find_all('div', class_='UPCdetail')
        print(len(items))
        for item in items:
            detail_url = item.find('a').attrs['href']
            data = [item.find('p').text]
            soup = get_detail(detail_url, 0)
            data.append(soup.find('div', class_='thumb-image').find('img').attrs['src'])
            # First table: product attributes (UPC-A, EAN-13, ASIN, category,
            # brand, model, last scanned). Each value is wrapped as ="..." so
            # spreadsheet apps keep long digit strings as text instead of
            # collapsing them to scientific notation.
            attr_table = soup.find_all('table')[0]
            for attr in attr_table.find_all('tr'):
                data.append('="' + attr.find_all('td')[1].text + '"')
            # Second table: store listings with name and price.
            store_table = soup.find_all('table')[1]
            for store in store_table.find('tbody').find_all('tr'):
                cells = store.find_all('td')
                data.append(cells[0].text)
                price_text = cells[2].text
                if price_text[0] == 'c':
                    # Cells starting with 'c' carry a text placeholder
                    # rather than a numeric price; keep it verbatim.
                    data.append(price_text)
                    data.append('')
                else:
                    # Prices look like "$12.34 USD": drop the leading
                    # currency symbol, then split amount from currency code.
                    parts = price_text[1:].split(' ')
                    data.append(parts[0])
                    data.append(parts[1])
            print(data)
            writer.writerow(data)
    f.close()
if __name__ == '__main__':
    # The search term comes from the command line; spaces become hyphens to
    # match the site's URL format.
    keyword = sys.argv[1].strip().replace(' ', '-')
    print(keyword)
    scrape_data(keyword)
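
# Example invocation (a sketch: the keyword is hypothetical, and it assumes
# the proxy addresses above are still reachable):
#
#   python barcode.py "coca cola"
#
# This searches barcodespider.com for "coca-cola" and writes one quoted CSV
# row per store listing into coca-cola_<month>_<day>_<year>.csv in the
# current directory.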