From 5f476ae95b75f4c7ce7cf0c6b36bfdb21692cb4e Mon Sep 17 00:00:00 2001 From: Remigiusz Marcinkiewicz Date: Sat, 14 Jan 2017 08:10:24 +0100 Subject: [PATCH] Updated IdeaBank fetcher/parser, it works but requires some refactoring/cleanups --- fetch/banking-ib.py | 164 ++++++++++++++++++++++++------------- fetch/config.py.dist | 8 ++ fetch/fetch.sh | 11 ++- fetch/pip-requirements.txt | 5 ++ 4 files changed, 128 insertions(+), 60 deletions(-) create mode 100644 fetch/config.py.dist create mode 100644 fetch/pip-requirements.txt diff --git a/fetch/banking-ib.py b/fetch/banking-ib.py index fca624c..4350b0b 100644 --- a/fetch/banking-ib.py +++ b/fetch/banking-ib.py @@ -26,46 +26,72 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -import csv -from getopt import getopt -import datetime -import re -import hashlib -import requests +from config import CurrentConfig +from datetime import date, datetime +from getopt import getopt, GetoptError +from sqlalchemy import Column, Integer, String, Boolean, Date, create_engine, MetaData +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +from sqlalchemy.schema import CreateTable, CreateIndex +from time import time import bs4 -import time -import random +import csv import enum -from datetime import date +import hashlib +import random +import re +import requests +import sys -if not __name__ == "__main__": - from webapp import app -else: - app = type("",(object,),{"config": {"SECRET": "foobar", "OWN_ACCOUNTS": ["PL48195000012006000648890002", "PL21195000012006000648890003", "PL91195000012006000648890004", "PL64195000012006000648890005", "PL45114010100000541244001003"]}})() +config = {key: getattr(CurrentConfig,key) for key in dir(CurrentConfig) if key.isupper()} +Base = declarative_base() + +class RawTransfer(Base): + __tablename__ = 'raw_transfer' + id = Column(Integer, primary_key=True) + raw = Column(String(512)) + uid = Column(String(128), index = True) + on_account = Column(String(32), index = True) + amount = Column(Integer) + currency = Column(String(8)) + date = Column(Date) + type = Column(String(16)) + index = Column(Integer) + + title = Column(String(256)) + + balance = Column(Integer) + balance_currency = Column(String(8)) + + from_account = Column(String(32)) + to_account = Column(String(32)) + + from_name = Column(String(256)) + to_name = Column(String(256)) class IBParseError(Exception): pass -class IBRow(object): - SECRET = app.config["SECRET"] - OWN_ACCOUNTS = app.config["OWN_ACCOUNTS"] +class IBRow(RawTransfer): + SECRET = config["SECRET"] + OWN_ACCOUNTS = config["OWN_ACCOUNTS"] def __unicode__(self): - return u"{} *{} #{} @{} -\"{}\" -#{} => +\"{}\" +#{} [{}.{:02d} {}] ({}.{:02d} {}) ~\"{}\"".format(self.type, self.index, self.current_account, self.time, self.from_name, self.from_account, self.to, self.account, self.amount/100, self.amount%100, self.currency, self.balance/100, self.balance%100, self.balance_currency, self.title) + return u"{} *{} #{} @{} -\"{}\" -#{} => +\"{}\" +#{} [{}.{:02d} {}] ({}.{:02d} {}) ~\"{}\"".format(self.type, self.index, self.on_account, self.date, self.from_name, self.from_account, self.to_name, self.to_account, self.amount/100, self.amount%100, self.currency, self.balance/100, self.balance%100, self.balance_currency, self.title) def __str__(self): return unicode(self).encode("utf-8") def __repr__(self): return str(self) - def __init__(self, row, current_account): - self.raw = row + def __init__(self, row, on_account, raw): + self.raw = raw self.index = 1 - self.current_account = current_account - self.time = datetime.datetime.strptime(row[IBField.date_completed], "%d.%m.%Y").date() - self.account = IBParser.parse_account_number(row[IBField.to_account]) - self.to = row[IBField.to_name] + self.on_account = on_account + self.date = datetime.strptime(row[IBField.date_completed], "%d.%m.%Y").date() + self.to_account = IBParser.parse_account_number(row[IBField.to_account]) + self.to_name = row[IBField.to_name] self.from_account = IBParser.parse_account_number(row[IBField.from_account]) self.from_name = row[IBField.from_name] self.title = row[IBField.title] @@ -84,22 +110,21 @@ class IBRow(object): self.balance = int(a)*100+int(b) self.balance_currency = c - if self.from_account == self.account: + if self.from_account == self.to_account: self.type = "BANK_FEE" - elif self.from_account in self.OWN_ACCOUNTS and self.account in self.OWN_ACCOUNTS: - if self.account == self.current_account: + elif self.from_account in self.OWN_ACCOUNTS and self.to_account in self.OWN_ACCOUNTS: + if self.to_account == self.on_account: self.type = "OUT_FROM_OWN" else: self.type = "OUT_TO_OWN" - elif self.from_account == self.current_account: + elif self.from_account == self.on_account: self.type = "OUT" - elif self.account == self.current_account: + elif self.to_account == self.on_account: self.type = "IN" else: raise IBParseError("Can't figure out transfer type for current row", row) self.uid = hashlib.sha256(self.SECRET + str(self)).hexdigest() - print self.uid class IBField(enum.Enum): from_name = u"Nadawca" @@ -122,7 +147,7 @@ class IBParser(object): c = csv.reader(snapshot.splitlines(), delimiter=";") header = [r.decode("utf-8") for r in next(c, None)] if header is None: - raise IBParseError("No header in history for {}".format(account_number)) + raise IBParseError("No header in history for {}".format(self.account_number)) for hf in header: try: @@ -134,16 +159,16 @@ class IBParser(object): if not len(row) == len(self.fields): raise IBParseError("Row has {} fields, {} expected after parsing the header: \"{}\"".format(len(row), len(self.fields), ';'.join(row))) d = dict(zip(self.fields, [r.decode("utf-8") for r in row])) - r = IBRow(d, account_number) + r = IBRow(d, self.account_number,";".join(row)) self.rows.append(r) - def get_by_type(self, y): - return [row for row in self.rows if row.type == y] + def get(self, type = None, on_account = None): + return [row for row in self.rows if (row.type == type or type is None) and (row.on_account == on_account or on_account is None)] @staticmethod def parse_account_number(s): formats = [ - "((?:[A-Za-z]{2})?[0-9]{2}) ([0-9]{4}) ([0-9]{4}) ([0-9]{4}) ([0-9]{4}) ([0-9]{4}) ([0-9]{4})", # 26 digits, optional country code - Poland + "((?:[A-Za-z]{2})?[0-9]{2})[ ]?([0-9]{4})[ ]?([0-9]{4})[ ]?([0-9]{4})[ ]?([0-9]{4})[ ]?([0-9]{4})[ ]?([0-9]{4})", # 26 digits, optional country code - Poland ] for f in formats: m = re.search(f, s) @@ -221,7 +246,7 @@ class IBFetcher(object): def _wait(self, seconds): print "[i] Waiting {} seconds".format(seconds) - #time.sleep(seconds) + time.sleep(seconds) def _gettoken(self, soup): i = soup.find("input", type="hidden", attrs={"name": "banking"}) @@ -243,7 +268,7 @@ class IBFetcher(object): m = re.search("\/main\/index\/token\/([0-9]+)\/time\/", str(soup.head)) if m is not None: t = m.group(1) - r = self._getraw("main/index/token/{}/time/{:.0f}.js".format(t, time.time()*1000), params={"t": "{:.16f}".format(random.random())}) + r = self._getraw("main/index/token/{}/time/{:.0f}.js".format(t, time()*1000), params={"t": "{:.16f}".format(random.random())}) print "[i] Fetched JS timestamp token: \"{}\"".format(r.text) def process_wallet_page(self, soup): @@ -331,30 +356,41 @@ def usage(): if __name__ == "__main__": try: - opts, args = getopt.getopt(sys.argv[1:], "cl:", ["cached", "load="]) - except getopt.GetoptError as err: + opts, args = getopt(sys.argv[1:], "hcl:", ["help", "cached", "load=", "print-schema"]) + except GetoptError as err: # print help information and exit: print str(err) # will print something like "option -a not recognized" usage() sys.exit(2) + CACHE_DIR = config["CACHE_DIR"] + engine = create_engine(config["SQLALCHEMY_DATABASE_URI"]) + session = sessionmaker(bind=engine)() + cached = False load_files = {} for o, a in opts: if o in ("-h", "--help"): usage() sys.exit() + elif o in ("--print-schema"): + print "[i] Called with --print-schema, will print the create statement and quit." + m = MetaData() + print CreateTable(IBRow.__table__).compile(engine),";" + for index in IBRow.__table__.indexes: + print CreateIndex(index).compile(engine),";" + sys.exit() elif o in ("-c", "--cached"): cached = True elif o in ("-l", "--load"): - account_number, f = a.split(":") - if account_number is None or f is None: + an, f = a.split(":") + if an is None or f is None: print "[e] --load argument \"{}\" appears malformed, could not split account number and file name".format(a) sys.exit(2) - account_number = IBParser.parse_account_number(account_number) + account_number = IBParser.parse_account_number(an) if account_number is None: - print "[e] Account number \"{}\" unparseable".format(account_number) + print "[e] Account number \"{}\" unparseable".format(an) history = open(f,'r').read() load_files[account_number] = history @@ -363,38 +399,48 @@ if __name__ == "__main__": else: assert False, "unhandled option" - accs = ["PL48195000012006000648890002", "PL21195000012006000648890003", "PL91195000012006000648890004", "PL64195000012006000648890005"] - if cached: print "[i] Cached run - will not connect to the bank" - history_logs = load_files + if len(load_files) > 0: + history_logs = load_files + else: + for account_number in config["IB_ACCOUNTS"]: + print "[e] Automated cache loading not implemented" + sys.exit(2) + else: print "[i] Normal run - will connect to the bank" fetcher = IBFetcher() history_logs = {} - wallet = fetcher.login(raw_input("[?] ID: "), raw_input("[?] Password: ")) + if "IB_LOGIN" not in config.keys() or "IB_PASSWORD" not in config.keys(): + wallet = fetcher.login(raw_input("[?] ID: "), raw_input("[?] Password: ")) + else: + print "[i] Using saved credentials" + wallet = fetcher.login(config["IB_LOGIN"], config["IB_PASSWORD"]) for account_number, account in wallet["accounts"].items(): - if account_number not in accs and False: - print "[i] Skipping {} ({})".format(account_number, account["id"]) - continue print "[i] Fetching history for account {} ({})".format(account_number, account["id"]) history = fetcher.fetch_account_history(account["id"]) - tmp = open(account_number,'w') - tmp.write(history) - tmp.close() + cachefile = open(CACHE_DIR+"/"+account_number,'w') + cachefile.write(history) + cachefile.close() history_logs[account_number] = history parsed = {} + stats = {} for account_number, history in history_logs.items(): print "[i] Parsing history for account {}".format(account_number) parser = IBParser(account_number) parser.parse(history) - parsed[account_number] = parser.rows + stats[account_number] = {} + stats[account_number]["added"] = 0 + stats[account_number]["skipped"] = 0 + for row in parser.get(): + if not session.query(IBRow).filter_by(uid=row.uid).first(): + session.add(row) + stats[account_number]["added"] += 1 + else: + stats[account_number]["skipped"] += 1 + session.commit() - for a,p in parsed.items(): - print "" - print "{}:".format(a) - for e in p: - print "\t{}".format(e) - print "" + print "[i] Done: ", stats #print f.create_report().read() diff --git a/fetch/config.py.dist b/fetch/config.py.dist new file mode 100644 index 0000000..ca8408f --- /dev/null +++ b/fetch/config.py.dist @@ -0,0 +1,8 @@ +class Config(object): + DEBUG = False + TESTING = False + SQLALCHEMY_DATABASE_URI = "sqlite:///data.db" + + +class DevelopmentConfig(Config): + DEBUG = True diff --git a/fetch/fetch.sh b/fetch/fetch.sh index e9ad8ba..c4be25c 100755 --- a/fetch/fetch.sh +++ b/fetch/fetch.sh @@ -1,2 +1,11 @@ #!/bin/sh -echo "$(date): Fetch started." >> fetch.log +K_DIR="$HOME" +K_FETCH_DIR="$K_DIR/fetch/" +K_FETCH_LOG="$K_FETCH_DIR/fetch.log" +K_FETCH_ENV="$K_FETCH_DIR/.env" + +. $K_FETCH_ENV/bin/activate + +echo "Fetch started." | ts >> "$K_FETCH_LOG" + +python "$K_FETCH_DIR/banking-ib.py" 2>&1 | ts | tee -a "$K_FETCH_LOG" diff --git a/fetch/pip-requirements.txt b/fetch/pip-requirements.txt new file mode 100644 index 0000000..a6213b4 --- /dev/null +++ b/fetch/pip-requirements.txt @@ -0,0 +1,5 @@ +beautifulsoup4 (4.3.2) +enum34 (1.1.6) +psycopg2 (2.5.4) +requests (2.5.1) +SQLAlchemy (0.9.8)