Commit 62fdbba

Create Scrape_ADC.py
This is the scraper. It could be improved by paginating until a page yields no more data, instead of looping over a fixed number of pages; see the sketch after the diff below.
1 parent 9008324 commit 62fdbba

1 file changed


Scrape_ADC.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import datetime
import time
import re
import json


def save_to_file(data, filename):
    with open(filename, "w") as text_file:
        text_file.write(data)


def getlinks():
    addresses = []
    for counter in range(0, 45):  # when this scraper was written, there were 44 pages altogether
        print('getting links page ' + str(counter))
        txt_doc = 'http://www.entwicklung.at/nc/zahlen-daten-und-fakten/projektliste/?tx_sysfirecdlist_pi1%5Btest%5D=test&tx_sysfirecdlist_pi1%5Bmode%5D=1&tx_sysfirecdlist_pi1%5Bsort%5D=uid%3A1&tx_sysfirecdlist_pi1%5Bpointer%5D=' + str(counter)
        soup = BeautifulSoup(urlopen(txt_doc), 'html.parser')
        # collect every link on the listing page that points to a project detail view
        for item in soup.find_all('a'):
            if 'href' in item.attrs:
                if "http://www.entwicklung.at/zahlen-daten-und-fakten/projektliste/?" in item['href']:
                    addresses.append(item['href'])
    return addresses


def striphtml(data):
    # crude tag stripper: drops everything between '<' and '>'
    p = re.compile(r'<.*?>')
    return p.sub('', data)


if __name__ == "__main__":
    addresses = getlinks()
    data = []

    l = len(addresses)
    k = 1
    print("downloaded %s addresses" % str(l))

    for address in addresses:
        elem = {}
        response = requests.get(address)
        soup2 = BeautifulSoup(response.text, 'html.parser')
        content = soup2.find_all('div', class_='cdcontentdiv')

        # the div with id 'c3936' holds the "last updated" date
        aktualisierung = soup2.find_all('div', id='c3936')
        div = aktualisierung[0].find_all('div')
        date = div[0].contents
        Aktualisierungsdatum = date[0].replace('\xa0', '')

        elem["Link"] = address
        elem["Vertragsnummer"] = striphtml(str(content[0]))
        elem["Vertragstitel"] = striphtml(str(content[1]))
        elem["LandRegion"] = striphtml(str(content[2]))
        elem["Vertragspartner"] = striphtml(str(content[3]))
        elem["Vertragssumme"] = striphtml(str(content[4]))
        elem["Beschreibung"] = striphtml(str(content[5]))
        ts = time.time()
        elem["Timestamp"] = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        elem["Aktualisierungsdatum"] = Aktualisierungsdatum
        print("downloaded %s of %s entries" % (k, l))
        k = k + 1

        data.append(elem)

    save_to_file(json.dumps(data, indent=2), 'Entwicklunghilfe.json')
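
The open-ended pagination suggested in the commit message could look roughly like the following sketch. getlinks_openended is a hypothetical replacement for getlinks; it assumes that requesting a pointer past the last page returns a listing with no new project links, which has not been verified against the site.

from urllib.request import urlopen
from bs4 import BeautifulSoup

def getlinks_openended():
    # Hypothetical variant of getlinks(): page forward until a listing
    # page contributes no new project links, instead of hard-coding 45 pages.
    addresses = []
    counter = 0
    while True:
        txt_doc = ('http://www.entwicklung.at/nc/zahlen-daten-und-fakten/projektliste/'
                   '?tx_sysfirecdlist_pi1%5Btest%5D=test'
                   '&tx_sysfirecdlist_pi1%5Bmode%5D=1'
                   '&tx_sysfirecdlist_pi1%5Bsort%5D=uid%3A1'
                   '&tx_sysfirecdlist_pi1%5Bpointer%5D=' + str(counter))
        soup = BeautifulSoup(urlopen(txt_doc), 'html.parser')
        page_links = [item['href'] for item in soup.find_all('a')
                      if 'href' in item.attrs and
                      "http://www.entwicklung.at/zahlen-daten-und-fakten/projektliste/?" in item['href']]
        # stop once a page adds nothing new (assumption: an out-of-range
        # pointer yields either an empty listing or a repeat of the last page)
        new_links = [a for a in page_links if a not in addresses]
        if not new_links:
            break
        addresses.extend(new_links)
        counter += 1
    return addresses

Deduplicating against already-collected links guards against a final page that repeats earlier results; if the site instead returns a page with no matching links past the end, the empty list ends the loop the same way.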
