-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrosscrl.py
More file actions
84 lines (67 loc) · 2.11 KB
/
Copy pathcrosscrl.py
File metadata and controls
84 lines (67 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import socket, select, string, sys
import sys
from BeautifulSoup import BeautifulSoup
import urllib
import re
import urlparse
import couchdb
# The CouchDB instance runs on the url_service (server) machine; its host name
# is taken from the first command-line argument and port 5984 is used.
# NOTE(review): sys.argv[1] is read here, at module load, before the argument
# count check in the __main__ section runs -- a missing argument therefore
# surfaces as an IndexError instead of the usage message.
server = couchdb.Server('http://%s:5984/' % sys.argv[1])
# Handle on the database named 'url_service'.
db = server['url_service']
# Crawl a single page and queue the links found on it.
def crawler(link):
    """Fetch *link*, collect the targets of its anchors and queue them.

    The ``href`` of every ``<a>`` element is resolved against *link* with
    ``urlparse.urljoin`` so relative references become absolute URLs.
    Anchors without an ``href`` are skipped, URLs containing ``#`` are
    dropped, and the remaining URLs are handed to ``crawler_db`` in document
    order.

    The whole operation is best-effort: if anything raises (download,
    parsing, or the database update) *link* is printed and the function
    returns normally.
    """
    try:
        page = urllib.urlopen(link)
        try:
            soup = BeautifulSoup(page)
        finally:
            # Close the response whether or not parsing succeeded.
            page.close()

        found = []
        for anchor in soup.findAll('a'):
            href = anchor.get('href')
            if href is None:
                # No href attribute: str(None) would otherwise be joined
                # into a bogus ".../None" URL and queued for crawling.
                continue
            absolute = urlparse.urljoin(str(link), str(href))
            if '#' not in absolute:
                found.append(absolute)
        crawler_db(found)
    except Exception:
        print(link)
# Append the links generated by the crawler to the 'unvisited' document.
def crawler_db(lst):
    """Add the distinct links in *lst* to the stored 'unvisited' list.

    Duplicates inside *lst* are dropped, keeping the first occurrence and
    the original order.  The surviving links are appended to the ``link``
    list of the 'unvisited' document, which is saved back under its current
    ``_id`` and ``_rev``.  Nothing is read or written when *lst* contains no
    links.

    NOTE(review): links are not compared with the entries already stored in
    'unvisited', nor with the 'visited' document, so a link that is already
    queued can be queued again.
    """
    # Order-preserving de-duplication; the set makes each membership test
    # O(1) instead of rescanning the list built so far.
    seen = set()
    fresh = []
    for url in lst:
        if url not in seen:
            seen.add(url)
            fresh.append(url)
    if not fresh:
        # Nothing to add: avoid creating a new document revision for a no-op.
        return

    pending = db.get('unvisited')
    doc_u = {
        '_id': pending['_id'],
        '_rev': pending['_rev'],
        'link': pending['link'] + fresh,
    }
    db.save(doc_u)
# Script entry point: connect to the server given as "hostname port", crawl
# every message received on the socket and acknowledge it with "ok".
if __name__ == "__main__":
    import os

    if len(sys.argv) < 3:
        print('invalid argument : it should be hostname port')
        sys.exit()

    host = sys.argv[1]
    port = int(sys.argv[2])

    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((host, port))
    except (socket.error, OverflowError):
        # socket.error covers refused/unreachable/unresolvable hosts;
        # OverflowError is raised for a port outside 0-65535.  A bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        print('Unable to connect')
        s.close()
        sys.exit()

    # stdin is watched alongside the socket until it reaches end-of-file.
    watch_stdin = True
    try:
        while True:
            socket_list = [sys.stdin, s] if watch_stdin else [s]
            # Get the list of sockets which are readable.
            read_sockets, write_sockets, error_sockets = select.select(
                socket_list, [], [])
            for sock in read_sockets:
                if sock == s:
                    # Incoming message from the remote server.  The socket
                    # documentation recommends a relatively small power of
                    # two for the receive size; a larger payload simply
                    # arrives over several recv() calls.
                    data = sock.recv(65536)
                    if not data:
                        print('\nDisconnected from chat server')
                        sys.exit()
                    else:
                        crawler(data)
                        # sendall() retries until the whole reply is written.
                        s.sendall("ok")
                else:
                    # Drain pending stdin input; if it is left unread,
                    # select() reports stdin as readable on every iteration
                    # and this loop spins forever printing blank lines.
                    if not os.read(sys.stdin.fileno(), 4096):
                        # End-of-file on stdin: stop watching it.
                        watch_stdin = False
                    print('')
    finally:
        # Release the connection on every exit path, including sys.exit().
        s.close()