Updated the IdeaBank fetcher/parser; it works but still needs some refactoring and cleanup

master
Remigiusz Marcinkiewicz 2017-01-14 08:10:24 +01:00
parent 3a9df521c0
commit 5f476ae95b
4 changed files with 128 additions and 60 deletions

View File

@ -26,46 +26,72 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import csv
from getopt import getopt
import datetime
import re
import hashlib
import requests
from config import CurrentConfig
from datetime import date, datetime
from getopt import getopt, GetoptError
from sqlalchemy import Column, Integer, String, Boolean, Date, create_engine, MetaData
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.schema import CreateTable, CreateIndex
from time import time
import bs4
import time
import random
import csv
import enum
from datetime import date
import hashlib
import random
import re
import requests
import sys
if not __name__ == "__main__":
from webapp import app
else:
app = type("",(object,),{"config": {"SECRET": "foobar", "OWN_ACCOUNTS": ["PL48195000012006000648890002", "PL21195000012006000648890003", "PL91195000012006000648890004", "PL64195000012006000648890005", "PL45114010100000541244001003"]}})()
config = {key: getattr(CurrentConfig,key) for key in dir(CurrentConfig) if key.isupper()}
Base = declarative_base()
class RawTransfer(Base):
    """Declarative ORM model for one bank transfer, stored in ``raw_transfer``.

    The column order below is the order used when the table is created, so it
    is kept exactly as declared.
    """

    __tablename__ = 'raw_transfer'

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)

    # Source text of the record and the identifier used to look rows up.
    raw = Column(String(512))
    uid = Column(String(128), index=True)

    # Account the record belongs to (indexed for lookups).
    on_account = Column(String(32), index=True)

    # Transferred amount, kept as an integer alongside its currency code.
    # NOTE(review): the subclass formats amount/100 and amount%100, so this
    # looks like hundredths of the currency unit - confirm.
    amount = Column(Integer)
    currency = Column(String(8))

    date = Column(Date)
    type = Column(String(16))
    index = Column(Integer)
    title = Column(String(256))

    # Balance, kept as an integer alongside its currency code.
    balance = Column(Integer)
    balance_currency = Column(String(8))

    # Counterparties: account numbers first, then display names.
    from_account = Column(String(32))
    to_account = Column(String(32))
    from_name = Column(String(256))
    to_name = Column(String(256))
class IBParseError(Exception):
    """Raised when downloaded account history cannot be parsed."""
class IBRow(object):
SECRET = app.config["SECRET"]
OWN_ACCOUNTS = app.config["OWN_ACCOUNTS"]
class IBRow(RawTransfer):
SECRET = config["SECRET"]
OWN_ACCOUNTS = config["OWN_ACCOUNTS"]
def __unicode__(self):
return u"{} *{} #{} @{} -\"{}\" -#{} => +\"{}\" +#{} [{}.{:02d} {}] ({}.{:02d} {}) ~\"{}\"".format(self.type, self.index, self.current_account, self.time, self.from_name, self.from_account, self.to, self.account, self.amount/100, self.amount%100, self.currency, self.balance/100, self.balance%100, self.balance_currency, self.title)
return u"{} *{} #{} @{} -\"{}\" -#{} => +\"{}\" +#{} [{}.{:02d} {}] ({}.{:02d} {}) ~\"{}\"".format(self.type, self.index, self.on_account, self.date, self.from_name, self.from_account, self.to_name, self.to_account, self.amount/100, self.amount%100, self.currency, self.balance/100, self.balance%100, self.balance_currency, self.title)
def __str__(self):
return unicode(self).encode("utf-8")
def __repr__(self):
return str(self)
def __init__(self, row, current_account):
self.raw = row
def __init__(self, row, on_account, raw):
self.raw = raw
self.index = 1
self.current_account = current_account
self.time = datetime.datetime.strptime(row[IBField.date_completed], "%d.%m.%Y").date()
self.account = IBParser.parse_account_number(row[IBField.to_account])
self.to = row[IBField.to_name]
self.on_account = on_account
self.date = datetime.strptime(row[IBField.date_completed], "%d.%m.%Y").date()
self.to_account = IBParser.parse_account_number(row[IBField.to_account])
self.to_name = row[IBField.to_name]
self.from_account = IBParser.parse_account_number(row[IBField.from_account])
self.from_name = row[IBField.from_name]
self.title = row[IBField.title]
@ -84,22 +110,21 @@ class IBRow(object):
self.balance = int(a)*100+int(b)
self.balance_currency = c
if self.from_account == self.account:
if self.from_account == self.to_account:
self.type = "BANK_FEE"
elif self.from_account in self.OWN_ACCOUNTS and self.account in self.OWN_ACCOUNTS:
if self.account == self.current_account:
elif self.from_account in self.OWN_ACCOUNTS and self.to_account in self.OWN_ACCOUNTS:
if self.to_account == self.on_account:
self.type = "OUT_FROM_OWN"
else:
self.type = "OUT_TO_OWN"
elif self.from_account == self.current_account:
elif self.from_account == self.on_account:
self.type = "OUT"
elif self.account == self.current_account:
elif self.to_account == self.on_account:
self.type = "IN"
else:
raise IBParseError("Can't figure out transfer type for current row", row)
self.uid = hashlib.sha256(self.SECRET + str(self)).hexdigest()
print self.uid
class IBField(enum.Enum):
from_name = u"Nadawca"
@ -122,7 +147,7 @@ class IBParser(object):
c = csv.reader(snapshot.splitlines(), delimiter=";")
header = [r.decode("utf-8") for r in next(c, None)]
if header is None:
raise IBParseError("No header in history for {}".format(account_number))
raise IBParseError("No header in history for {}".format(self.account_number))
for hf in header:
try:
@ -134,16 +159,16 @@ class IBParser(object):
if not len(row) == len(self.fields):
raise IBParseError("Row has {} fields, {} expected after parsing the header: \"{}\"".format(len(row), len(self.fields), ';'.join(row)))
d = dict(zip(self.fields, [r.decode("utf-8") for r in row]))
r = IBRow(d, account_number)
r = IBRow(d, self.account_number,";".join(row))
self.rows.append(r)
def get_by_type(self, y):
    """Return a list of the rows in ``self.rows`` whose ``type`` equals *y*."""
    return list(filter(lambda candidate: candidate.type == y, self.rows))
def get(self, type=None, on_account=None):
    """Return the rows in ``self.rows`` that pass the given filters.

    Args:
        type: required value of ``row.type``; ``None`` accepts any type.
        on_account: required value of ``row.on_account``; ``None`` accepts
            any account.

    Returns:
        A new list of matching rows, in their original order.
    """
    selected = []
    for candidate in self.rows:
        # A filter only rejects a row when it was actually supplied.
        if type is not None and not (candidate.type == type):
            continue
        if on_account is not None and not (candidate.on_account == on_account):
            continue
        selected.append(candidate)
    return selected
@staticmethod
def parse_account_number(s):
formats = [
"((?:[A-Za-z]{2})?[0-9]{2}) ([0-9]{4}) ([0-9]{4}) ([0-9]{4}) ([0-9]{4}) ([0-9]{4}) ([0-9]{4})", # 26 digits, optional country code - Poland
"((?:[A-Za-z]{2})?[0-9]{2})[ ]?([0-9]{4})[ ]?([0-9]{4})[ ]?([0-9]{4})[ ]?([0-9]{4})[ ]?([0-9]{4})[ ]?([0-9]{4})", # 26 digits, optional country code - Poland
]
for f in formats:
m = re.search(f, s)
@ -221,7 +246,7 @@ class IBFetcher(object):
def _wait(self, seconds):
print "[i] Waiting {} seconds".format(seconds)
#time.sleep(seconds)
time.sleep(seconds)
def _gettoken(self, soup):
i = soup.find("input", type="hidden", attrs={"name": "banking"})
@ -243,7 +268,7 @@ class IBFetcher(object):
m = re.search("\/main\/index\/token\/([0-9]+)\/time\/", str(soup.head))
if m is not None:
t = m.group(1)
r = self._getraw("main/index/token/{}/time/{:.0f}.js".format(t, time.time()*1000), params={"t": "{:.16f}".format(random.random())})
r = self._getraw("main/index/token/{}/time/{:.0f}.js".format(t, time()*1000), params={"t": "{:.16f}".format(random.random())})
print "[i] Fetched JS timestamp token: \"{}\"".format(r.text)
def process_wallet_page(self, soup):
@ -331,30 +356,41 @@ def usage():
if __name__ == "__main__":
try:
opts, args = getopt.getopt(sys.argv[1:], "cl:", ["cached", "load="])
except getopt.GetoptError as err:
opts, args = getopt(sys.argv[1:], "hcl:", ["help", "cached", "load=", "print-schema"])
except GetoptError as err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
CACHE_DIR = config["CACHE_DIR"]
engine = create_engine(config["SQLALCHEMY_DATABASE_URI"])
session = sessionmaker(bind=engine)()
cached = False
load_files = {}
for o, a in opts:
if o in ("-h", "--help"):
usage()
sys.exit()
elif o in ("--print-schema"):
print "[i] Called with --print-schema, will print the create statement and quit."
m = MetaData()
print CreateTable(IBRow.__table__).compile(engine),";"
for index in IBRow.__table__.indexes:
print CreateIndex(index).compile(engine),";"
sys.exit()
elif o in ("-c", "--cached"):
cached = True
elif o in ("-l", "--load"):
account_number, f = a.split(":")
if account_number is None or f is None:
an, f = a.split(":")
if an is None or f is None:
print "[e] --load argument \"{}\" appears malformed, could not split account number and file name".format(a)
sys.exit(2)
account_number = IBParser.parse_account_number(account_number)
account_number = IBParser.parse_account_number(an)
if account_number is None:
print "[e] Account number \"{}\" unparseable".format(account_number)
print "[e] Account number \"{}\" unparseable".format(an)
history = open(f,'r').read()
load_files[account_number] = history
@ -363,38 +399,48 @@ if __name__ == "__main__":
else:
assert False, "unhandled option"
accs = ["PL48195000012006000648890002", "PL21195000012006000648890003", "PL91195000012006000648890004", "PL64195000012006000648890005"]
if cached:
print "[i] Cached run - will not connect to the bank"
history_logs = load_files
if len(load_files) > 0:
history_logs = load_files
else:
for account_number in config["IB_ACCOUNTS"]:
print "[e] Automated cache loading not implemented"
sys.exit(2)
else:
print "[i] Normal run - will connect to the bank"
fetcher = IBFetcher()
history_logs = {}
wallet = fetcher.login(raw_input("[?] ID: "), raw_input("[?] Password: "))
if "IB_LOGIN" not in config.keys() or "IB_PASSWORD" not in config.keys():
wallet = fetcher.login(raw_input("[?] ID: "), raw_input("[?] Password: "))
else:
print "[i] Using saved credentials"
wallet = fetcher.login(config["IB_LOGIN"], config["IB_PASSWORD"])
for account_number, account in wallet["accounts"].items():
if account_number not in accs and False:
print "[i] Skipping {} ({})".format(account_number, account["id"])
continue
print "[i] Fetching history for account {} ({})".format(account_number, account["id"])
history = fetcher.fetch_account_history(account["id"])
tmp = open(account_number,'w')
tmp.write(history)
tmp.close()
cachefile = open(CACHE_DIR+"/"+account_number,'w')
cachefile.write(history)
cachefile.close()
history_logs[account_number] = history
parsed = {}
stats = {}
for account_number, history in history_logs.items():
print "[i] Parsing history for account {}".format(account_number)
parser = IBParser(account_number)
parser.parse(history)
parsed[account_number] = parser.rows
stats[account_number] = {}
stats[account_number]["added"] = 0
stats[account_number]["skipped"] = 0
for row in parser.get():
if not session.query(IBRow).filter_by(uid=row.uid).first():
session.add(row)
stats[account_number]["added"] += 1
else:
stats[account_number]["skipped"] += 1
session.commit()
for a,p in parsed.items():
print ""
print "{}:".format(a)
for e in p:
print "\t{}".format(e)
print ""
print "[i] Done: ", stats
#print f.create_report().read()

8
fetch/config.py.dist Normal file
View File

@ -0,0 +1,8 @@
class Config(object):
    """Base settings; other configuration classes inherit and override these."""

    # Database connection URL (a SQLite file named data.db by default).
    SQLALCHEMY_DATABASE_URI = "sqlite:///data.db"

    # Both switches are off in the base configuration.
    DEBUG = False
    TESTING = False
class DevelopmentConfig(Config):
    """Settings for development: everything from ``Config`` with DEBUG on."""

    DEBUG = True

View File

@ -1,2 +1,11 @@
#!/bin/sh
echo "$(date): Fetch started." >> fetch.log
# Locations used by the fetch job, all derived from the user's home directory.
# No trailing slash on K_FETCH_DIR, so derived paths do not contain "//".
K_DIR="$HOME"
K_FETCH_DIR="$K_DIR/fetch"
K_FETCH_LOG="$K_FETCH_DIR/fetch.log"
K_FETCH_ENV="$K_FETCH_DIR/.env"

# Source the environment's activate script (presumably a Python virtualenv -
# confirm). Quoted so a home directory containing whitespace is not word-split.
. "$K_FETCH_ENV/bin/activate"

# Log the start marker, then run the fetcher with stdout and stderr merged;
# output goes through `ts` and is both shown and appended to the log.
# NOTE(review): `ts` is presumably the moreutils timestamping filter - confirm
# it is installed on the host.
echo "Fetch started." | ts >> "$K_FETCH_LOG"
python "$K_FETCH_DIR/banking-ib.py" 2>&1 | ts | tee -a "$K_FETCH_LOG"

View File

@ -0,0 +1,5 @@
beautifulsoup4 (4.3.2)
enum34 (1.1.6)
psycopg2 (2.5.4)
requests (2.5.1)
SQLAlchemy (0.9.8)