-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsqlToDictionary.py
More file actions
36 lines (27 loc) · 784 Bytes
/
sqlToDictionary.py
File metadata and controls
36 lines (27 loc) · 784 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
import re
import MySQLdb
# Parameters for the MySQL server that holds the RawHTML table.
# NOTE(review): credentials are hard-coded in source -- consider loading
# them from configuration or the environment instead.
_CONNECTION_SETTINGS = dict(
    host="pow-db.clfpwrv3fbfn.us-west-2.rds.amazonaws.com",
    port=4200,
    user="cedro",
    passwd="password",
    db="powdb",
)

# Opened once at import time; main() obtains its cursor from this connection.
db = MySQLdb.connect(**_CONNECTION_SETTINGS)
# Delimiters that separate candidate words in the stored text. A raw string
# keeps ``\*`` a regex escape rather than an invalid string escape.
_WORD_DELIMITERS = re.compile(r'; |, |\*|<>\n')
# Runs of non-word characters (including whitespace) stripped from candidates.
_NON_WORD_CHARS = re.compile(r'\W+')
# Fragments longer than this, measured before cleaning, are ignored.
_MAX_FRAGMENT_LENGTH = 30


def _extract_words(html_text):
    """Return the set of cleaned, non-empty words found in ``html_text``."""
    found = set()
    for fragment in _WORD_DELIMITERS.split(html_text):
        if len(fragment) > _MAX_FRAGMENT_LENGTH:
            continue
        # Removing every non-word run also removes surrounding whitespace,
        # so no separate strip() is required.
        cleaned = _NON_WORD_CHARS.sub('', fragment)
        # A fragment made only of punctuation/whitespace cleans to '', which
        # would otherwise end up as a blank line in the dictionary.
        if cleaned:
            found.add(cleaned)
    return found


def main():
    """Build ``./dictionary.txt`` from the rows of the ``RawHTML`` table.

    Every row returned by ``SELECT * from RawHTML`` contributes the words
    extracted from its third column (index 2). The unique words are written
    one per line, in sorted order so repeated runs produce the same file.
    """
    words = set()
    cursor = db.cursor()
    try:
        cursor.execute("SELECT * from RawHTML")
        for row in cursor.fetchall():
            html_text = row[2]
            # A NULL column comes back as None and has nothing to split.
            if html_text is None:
                continue
            words.update(_extract_words(html_text))
    finally:
        # Release the cursor even if the query or the processing fails.
        cursor.close()

    # The context manager guarantees the file is flushed and closed.
    with open("./dictionary.txt", 'w') as wordfile:
        wordfile.writelines(word + "\n" for word in sorted(words))
if __name__ == "__main__":
    # Script entry point: build the dictionary, then release the database
    # connection that was opened at import time, even if main() fails.
    try:
        main()
    finally:
        db.close()