|
| 1 | +#!/usr/bin/python |
| 2 | +# -*- coding: utf8 -*- |
| 3 | + |
| 4 | +from urllib.request import urlopen |
| 5 | +from bs4 import BeautifulSoup |
| 6 | +import bs4 |
| 7 | +import requests |
| 8 | +from datetime import datetime |
| 9 | +import datetime |
| 10 | +import csv |
| 11 | +import time |
| 12 | +import re |
| 13 | +import json |
| 14 | + |
def save_to_file(data, filename):
    """Write *data* to *filename*, replacing any existing content.

    Uses a context manager so the handle is closed even if the write
    raises, and an explicit UTF-8 encoding so non-ASCII text (German
    umlauts in this scraper's output) is written portably instead of
    depending on the platform's default codec.
    """
    with open(filename, "w", encoding="utf-8") as text_file:
        text_file.write(data)
| 19 | + |
def getlinks(pages=45):
    """Collect project-detail links from the paginated ADA project list.

    Walks the paginated listing and returns every anchor href that points
    at a project-detail page.

    Parameters:
        pages: number of listing pages to fetch. Defaults to 45 because
            there were 44 pages (pointers 0..44) when this scraper was
            written; pass a different value if the site grows or shrinks.

    Returns:
        list of URL strings, in page order, possibly with duplicates.
    """
    addresses = []
    for counter in range(0, pages):
        print('getting links page ' + str(counter))
        # The tx_sysfirecdlist pointer parameter selects the result page.
        txt_doc = 'http://www.entwicklung.at/nc/zahlen-daten-und-fakten/projektliste/?tx_sysfirecdlist_pi1%5Btest%5D=test&tx_sysfirecdlist_pi1%5Bmode%5D=1&tx_sysfirecdlist_pi1%5Bsort%5D=uid%3A1&tx_sysfirecdlist_pi1%5Bpointer%5D=' + str(counter)
        soup = BeautifulSoup(urlopen(txt_doc), 'html.parser')
        for item in soup.find_all('a'):
            # Anchors without an href (e.g. named anchors) have no key at all.
            if 'href' in item.attrs:
                if "http://www.entwicklung.at/zahlen-daten-und-fakten/projektliste/?" in item['href']:
                    addresses.append(item['href'])
    return addresses
| 31 | + |
def striphtml(data):
    """Return *data* with every HTML/XML tag removed.

    Non-greedy match, so adjacent tags like ``<b></b>`` are each removed
    individually; text between tags is preserved unchanged.
    """
    return re.sub(r'<.*?>', '', data)
| 35 | + |
| 36 | + |
if __name__ == "__main__":
    addresses = getlinks()
    data = []

    total = len(addresses)
    print("downloaded %s addresses" % str(total))

    # enumerate replaces the manual k = 1 / k += 1 progress counter.
    for k, address in enumerate(addresses, start=1):
        elem = {}
        response = requests.get(address)
        # Name the parser explicitly: bs4 without a parser argument emits a
        # warning and may auto-select different parsers on different hosts,
        # which can change the parsed tree.
        soup2 = bs4.BeautifulSoup(response.text, 'html.parser')
        # A set as the second argument matches by CSS class 'cdcontentdiv';
        # the six matched divs are assumed to appear in this fixed order.
        content = soup2.find_all('div', {'cdcontentdiv'})

        # The last-updated date lives in a div with the fixed id 'c3936';
        # strip the non-breaking space padding around it.
        aktualisierung = soup2.find_all('div', id='c3936')
        div = aktualisierung[0].find_all('div')
        date = div[0].contents
        Aktualisierungsdatum = date[0].replace('\xa0', '')

        elem["Link"] = address
        elem["Vertragsnummer"] = striphtml(str(content[0]))
        elem["Vertragstitel"] = striphtml(str(content[1]))
        elem["LandRegion"] = striphtml(str(content[2]))
        elem["Vertragspartner"] = striphtml(str(content[3]))
        elem["Vertragssumme"] = striphtml(str(content[4]))
        elem["Beschreibung"] = striphtml(str(content[5]))
        # NOTE: 'import datetime' shadows the earlier
        # 'from datetime import datetime', so the module form is required here.
        ts = time.time()
        elem["Timestamp"] = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        elem["Aktualisierungsdatum"] = Aktualisierungsdatum
        print("downloaded %s of %s entries" % (k, total))

        data.append(elem)
        # Saving inside the loop checkpoints progress after every entry, so
        # a crash mid-run still leaves all completed entries on disk.
        save_to_file(json.dumps(data, indent=2), 'Entwicklunghilfe.json')
0 commit comments