-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathparse_statements.py
More file actions
executable file
·130 lines (104 loc) · 4.57 KB
/
parse_statements.py
File metadata and controls
executable file
·130 lines (104 loc) · 4.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/python3
"""
SoFi does not provide a way to export transactions, so this script parses
their PDF statements and writes the transactions out as CSV files.
"""
import re
import os
import csv
from datetime import datetime
from tika import unpack
# Folder that is walked for PDF statements (overridable via the environment).
STATEMENTS_FOLDER = os.environ.get("STATEMENTS_FOLDER", "Statements")
# Folder that receives one CSV file per parsed statement.
TRANSACTIONS_FOLDER = os.environ.get("TRANSACTIONS_FOLDER", "Transactions")
# Keyword after which the extracted text is treated as transaction data.
TRANSACTIONS_HEADER = "DATE TYPE DESCRIPTION AMOUNT BALANCE"
# Mirrored output sub directories; filled in on the first os.walk() iteration.
output_dirs = None

# Potentially could get any of this data and more from the PDF
# but I just want the transactions
keywords = [
    "Primary Account Holder",
    "Member since",
    "Account Number",
    "Monthly Statement Period",
    "Current Balance",
    "Current Interest Rate",
    "Interest Rate Earned This Period",
    "Monthly Interest Paid",
    "Beginning Balance",
    "APY Earned This Period",
    "Year-to-date Interest Paid",
    "Transaction Details",
    TRANSACTIONS_HEADER,
    "Contact Information",
    "Sweep Program Details",
]

# Splits the extracted text on any keyword. The capturing group keeps the
# keyword in the result so it can be paired with the text that follows it.
# Keywords are escaped so they are always matched literally.
KEYWORD_PATTERN = re.compile("(" + "|".join(map(re.escape, keywords)) + ")")
# Splits a transaction section by the date format "Jan 1, 1970" or 2 new lines.
TRANSACTION_SPLIT_PATTERN = re.compile(r"(\w{3} \d{1,2}, 20\d{2})|\n\n")


def parse_money(text):
    """Convert a money string such as ``"-$1,234.56"`` to a float.

    Raises:
        ValueError: If the text is not a number once "$" and "," are removed.
    """
    return float(text.replace("$", "").replace(",", ""))


def parse_transactions(section):
    """Turn the text that follows a transactions header into rows.

    Args:
        section: Text found between TRANSACTIONS_HEADER and the next keyword.

    Returns:
        A list of ``[date, description, amount, balance]`` rows where the date
        is formatted as ``YYYY/MM/DD`` and amount/balance are floats.

    Raises:
        ValueError: If a date does not match ``"Jan 1, 1970"`` or the last
            token of a row is not exactly an amount followed by a balance.
    """
    # re.split() yields "" and None entries around the separators; drop those,
    # then flatten the newlines embedded in what is left.
    tokens = [
        token.replace("\n", " ").strip()
        for token in TRANSACTION_SPLIT_PATTERN.split(section)
        if token
    ]
    rows = []
    # Read the token stream three at a time: date, description, figures.
    # zip() drops a trailing incomplete group.
    for date_text, description, figures in zip(*[iter(tokens)] * 3):
        date = datetime.strptime(date_text, "%b %d, %Y")
        amount, balance = figures.split()
        rows.append(
            [
                date.strftime("%Y/%m/%d"),
                # Collapse runs of whitespace inside the description.
                " ".join(description.split()),
                parse_money(amount),
                parse_money(balance),
            ]
        )
    return rows


def parse_statement_text(contents):
    """Collect the transaction rows of every transaction table in a statement.

    Args:
        contents: Text extracted from one PDF statement. ``None`` or an empty
            string (no extractable text) is accepted and yields no rows.

    Returns:
        The rows of all transaction sections, in document order.
    """
    if not contents:
        return []
    # With one capturing group the split result alternates
    # text, keyword, text, ..., so every keyword has a following text part.
    parts = KEYWORD_PATTERN.split(contents)
    rows = []
    for keyword, value in zip(parts[1::2], parts[2::2]):
        if keyword == TRANSACTIONS_HEADER:
            rows.extend(parse_transactions(value))
    return rows


def transactions_output_path(pdf_path, statements_folder, transactions_folder):
    """Map a statement path to its CSV path below the transactions folder.

    Only the leading statements folder is swapped; a file or sub directory
    name that happens to contain the folder name is left untouched.
    """
    stem = os.path.splitext(pdf_path)[0]
    relative_stem = os.path.relpath(stem, statements_folder)
    return os.path.join(transactions_folder, relative_stem) + ".csv"


for root, dirs, files in os.walk(STATEMENTS_FOLDER):
    if not output_dirs:
        # Make transaction dirs if they don't exist
        # * I have my statements saved in sub dirs by year so this creates those
        output_dirs = sorted(f"{TRANSACTIONS_FOLDER}/{d}" for d in dirs)
        for transaction_dir in output_dirs:
            os.makedirs(transaction_dir, exist_ok=True)
    for filename in files:
        path = f"{root}/{filename}"
        print("reading file at " + path)
        if os.path.splitext(path)[1].lower() != ".pdf":
            continue
        # "content" can be present but None when no text could be extracted.
        contents = unpack.from_file(path).get("content")
        file_data = parse_statement_text(contents)
        if not file_data:
            print("no file data after parsing!")
            continue
        output_filename = transactions_output_path(
            path, STATEMENTS_FOLDER, TRANSACTIONS_FOLDER
        )
        # Statements in the root folder or in nested sub directories have no
        # pre-created output directory, so make sure it exists.
        output_parent = os.path.dirname(output_filename)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)
        # Write and read the files
        # newline="" lets the csv module control the line endings itself.
        with open(output_filename, "w", newline="") as csv_file:
            csv.writer(csv_file).writerows(file_data)
        with open(output_filename, "r") as csv_file:
            print(output_filename)
            print(csv_file.read())