sofi-statement-parser/parse_statements.py at master · benpetty/sofi-statement-parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/python3
"""
SoFi does not offer a way to export transactions, so this script parses its PDF
statements and writes the transactions it finds to CSV files.
"""

import re
import os
import csv
from datetime import datetime

from tika import unpack


# Input / output roots; both can be overridden through the environment.
STATEMENTS_FOLDER = os.environ.get("STATEMENTS_FOLDER", "Statements")
TRANSACTIONS_FOLDER = os.environ.get("TRANSACTIONS_FOLDER", "Transactions")

# Text that introduces a block of transactions in the extracted PDF content.
TRANSACTIONS_HEADER = "DATE TYPE DESCRIPTION AMOUNT BALANCE"

# Potentially could get any of this data and more from the PDF
# but I just want the transactions.  The full list is still needed: every
# keyword acts as a section delimiter, so the text following
# TRANSACTIONS_HEADER stops at whichever keyword comes next.
keywords = [
    "Primary Account Holder",
    "Member since",
    "Account Number",
    "Monthly Statement Period",
    "Current Balance",
    "Current Interest Rate",
    "Interest Rate Earned This Period",
    "Monthly Interest Paid",
    "Beginning Balance",
    "APY Earned This Period",
    "Year-to-date Interest Paid",
    "Transaction Details",
    TRANSACTIONS_HEADER,
    "Contact Information",
    "Sweep Program Details",
]

# Built once instead of once per file.  The capture group makes re.split keep
# each matched keyword in its result, so the output alternates
# text, keyword, text, keyword, ..., text.
_KEYWORD_PATTERN = re.compile(
    "(" + "|".join(re.escape(keyword) for keyword in keywords) + ")"
)

# Splits a transaction section on a date such as "Jan 1, 1970" (kept, because
# it is captured) or on a blank line (dropped).
_ROW_SPLIT_PATTERN = re.compile(r"(\w{3} \d{1,2}, 20\d{2})|\n\n")


def parse_money(text: str) -> float:
    """Convert a printed amount such as "-$1,234.56" to a float.

    Raises:
        ValueError: if what remains after removing "$" and "," is not a number.
    """
    return float(text.replace("$", "").replace(",", ""))


def parse_transactions(value: str) -> list:
    """Parse the text that follows one TRANSACTIONS_HEADER into rows.

    Args:
        value: raw text between the transactions header and the next keyword.

    Returns:
        A list of ``[date, description, amount, balance]`` rows, where ``date``
        is a "YYYY/MM/DD" string, ``description`` has its whitespace collapsed
        and ``amount`` / ``balance`` are floats.

    Raises:
        ValueError: if a chunk expected to be a date or an "amount balance"
            pair does not have that shape.
    """
    # re.split yields None for the capture group whenever the blank-line
    # alternative matched, plus empty strings between adjacent separators;
    # both are discarded before the remaining chunks are tidied up.
    chunks = [
        chunk.replace("\n", " ").strip()
        for chunk in _ROW_SPLIT_PATTERN.split(value)
        if chunk
    ]

    rows = []
    # Consume the flat chunk stream three at a time: date, description,
    # "amount balance".  zip() drops a trailing incomplete group.
    for date_text, description, money in zip(*[iter(chunks)] * 3):
        date = datetime.strptime(date_text, "%b %d, %Y")
        amount, balance = money.split()
        rows.append(
            [
                date.strftime("%Y/%m/%d"),
                " ".join(description.split()),
                parse_money(amount),
                parse_money(balance),
            ]
        )
    return rows


def extract_transactions(contents: str) -> list:
    """Collect the rows of every transaction section found in ``contents``.

    Args:
        contents: the full text extracted from one statement PDF.

    Returns:
        All parsed rows in document order; an empty list when the text holds
        no transaction section.
    """
    parts = _KEYWORD_PATTERN.split(contents)
    rows = []
    # parts[1::2] are the matched keywords and parts[2::2] the text following
    # each of them; the split always produces them in pairs.
    for key, value in zip(parts[1::2], parts[2::2]):
        if key == TRANSACTIONS_HEADER:
            rows.extend(parse_transactions(value))
    return rows


def csv_path_for(
    pdf_path: str, statements_folder: str, transactions_folder: str
) -> str:
    """Return the CSV path that mirrors ``pdf_path`` under the output folder.

    Only the leading ``statements_folder`` part of the path is swapped, so a
    folder name that also occurs inside a file or sub-directory name is left
    alone.
    """
    relative = os.path.relpath(pdf_path, statements_folder)
    return os.path.join(
        transactions_folder, os.path.splitext(relative)[0] + ".csv"
    )


def main() -> None:
    """Convert every PDF below STATEMENTS_FOLDER into a CSV of transactions."""
    for root, _dirs, files in os.walk(STATEMENTS_FOLDER):
        for filename in files:
            # Accept ".PDF" as well as ".pdf"; anything else is not a statement.
            if os.path.splitext(filename)[1].lower() != ".pdf":
                continue

            path = os.path.join(root, filename)
            print("reading file at " + path)

            # The "content" key may be present but None; treat that as empty
            # text rather than handing None to the regex.
            contents = unpack.from_file(path).get("content") or ""
            file_data = extract_transactions(contents)

            if not file_data:
                print("no file data after parsing!")
                continue

            output_filename = csv_path_for(
                path, STATEMENTS_FOLDER, TRANSACTIONS_FOLDER
            )

            # Create the destination on demand so PDFs at any depth work,
            # including those sitting directly in STATEMENTS_FOLDER.
            output_dir = os.path.dirname(output_filename)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # newline="" is what the csv module expects; without it Windows
            # output gets an empty line after every row.
            with open(output_filename, "w", newline="") as csv_file:
                csv.writer(csv_file).writerows(file_data)

            # Echo the result back so the run can be eyeballed.
            print(output_filename)
            with open(output_filename, "r") as csv_file:
                print(csv_file.read())


if __name__ == "__main__":
    main()