-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
73 lines (58 loc) · 1.91 KB
/
Copy pathmain.py
File metadata and controls
73 lines (58 loc) · 1.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import urllib
import re
import mechanize
from bs4 import BeautifulSoup
import urlparse
import couchdb
#importing couchdb
#url = raw_input('Enter Url \n')
# Seed URLs for the crawl; the main loop walks this list by index.
url = ["http://issc.unipune.ac.in"]
# Flat copy of the seed URLs. The previous `[url]` wrapped the whole list in
# another list, so a membership test against a URL string could never succeed.
visited = list(url)

# No address is passed, so couchdb-python's default server URL is used.
couch = couchdb.Server()

# Start every run from an empty 'test' database. On a fresh server the
# database does not exist yet; that is expected and must not abort the run.
try:
    del couch['test']
except couchdb.ResourceNotFound:
    pass
db = couch.create('test')

# Index of the next entry of `url` to process.
i = 0
# Link-text filter handed to mechanize: it accepts only text in which the
# substring "IMG" never occurs. Compiled once rather than once per page.
_TEXT_LINK_RE = re.compile('^((?!IMG).)*$')


def _text_links(browser):
    """Collect the text links of the page currently loaded in *browser*.

    Returns a list of ``(address, text)`` tuples in page order. Each address
    is rebuilt as ``"http://" + hostname + path``, so the original scheme,
    port, query string and fragment are discarded.
    """
    found = []
    for link in browser.links(text_regex=_TEXT_LINK_RE):
        parts = urlparse.urlparse(urlparse.urljoin(link.base_url, link.url))
        if parts.hostname is None:
            # Host-less targets (e.g. mailto: or javascript:) cannot be
            # rebuilt into an http address; concatenating None would raise.
            continue
        found.append(("http://" + parts.hostname + parts.path, link.text))
    return found


def _image_links(soup):
    """Collect ``(href, alt)`` tuples for every anchor in *soup* that wraps an image.

    The alt text comes from the first image inside the anchor that has an
    ``alt`` attribute; when no image has one, the string 'No Alt' is used.
    Anchors without an ``href`` are skipped.
    """
    found = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            # Nothing to record for an anchor that points nowhere.
            continue
        described = anchor.find('img', alt=True)
        if described is not None:
            found.append((href, described['alt']))
        elif anchor.find('img') is not None:
            found.append((href, 'No Alt'))
    return found


def _print_pair(first, second):
    """Print two strings on consecutive lines, followed by a blank line.

    Each value is followed by a single space before its line break, which
    keeps the layout the script produced with its comma-separated Python 2
    print statement. The two values are printed separately so that they are
    never combined into one string.
    """
    print(first + " ")
    print(second + " \n")


# Process every entry of `url` in order: fetch the page, list its text links
# (each one is also stored in CouchDB) and list its image links.
# NOTE: nothing in this loop appends to `url`, so only the seed entries are
# visited.
while i < len(url):
    page_url = url[i]

    # Mechanize: fetch the page with a browser-like User-agent.
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Firefox')]
    br.open(page_url)

    # BeautifulSoup: parse the body mechanize already downloaded instead of
    # requesting the page a second time. The parser is named explicitly so
    # the result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(br.response().read(), 'html.parser')

    text_links = _text_links(br)
    image_links = _image_links(soup)

    # Label the output with the host of the page being processed; fall back
    # to the raw entry when it has no parseable host.
    page_host = urlparse.urlparse(page_url).hostname or page_url

    print("\n\n\n\nLinks and Text for " + page_host + ":\n\n")
    for address, text in text_links:
        _print_pair(address, text)
        doc = {'link': address}
        if text:
            # The link text doubles as the document id. An empty string is
            # not a usable id, so such documents are saved without one.
            doc['_id'] = text
        try:
            db.save(doc)
        except couchdb.ResourceConflict:
            # The same link text was already stored; keep the first entry.
            print("Skipping duplicate link text: " + text)

    print("\n\n\n\nImage links and alt for " + page_host + ":\n\n")
    for href, alt in image_links:
        _print_pair(href, alt)

    i += 1