#!/usr/bin/env python

# Author: Josh Holtrop
# Date: 2012-10-28
# Purpose: Convert a HTML "Account Report" output from gnucash to csv format
# Usage: gnucash-convert-account-report-to-csv account.html > account.csv

import os
import sys
import re

# Any single markup tag that contains no nested angle brackets.
_TAG_RE = re.compile(r'<[^<>]*>')
# Closing row tag: finishes the row currently being collected.
_ROW_END_RE = re.compile(r'<\/tr\s*>')
# A <td>/<th> cell that opens and closes on the same line.  The \b keeps
# tags such as <thead> from being mistaken for a <th> cell.
_CELL_ONE_LINE_RE = re.compile(r'<t[hd]\b[^>]*>(.*)<\/t[hd]>')
# A <td>/<th> cell that opens on this line but is closed on a later one.
_CELL_OPEN_RE = re.compile(r'<t[hd]\b[^>]*>(.*)$')
# The closing tag of a cell that was opened on an earlier line.
_CELL_CLOSE_RE = re.compile(r'(.*)<\/t[hd]>')

# Only rows with exactly this many cells are written to the CSV output.
_EXPECTED_COLUMNS = 8


def filter_contents(s):
    """Return the text of a table cell with markup and formatting removed.

    Tags are stripped repeatedly until the string stops changing, so that a
    tag exposed by removing an inner tag is removed as well.  Commas and
    dollar signs are then deleted (the comma is the output field separator)
    and surrounding whitespace is trimmed.
    """
    while True:
        new_s = _TAG_RE.sub('', s)
        if new_s == s:
            break
        s = new_s
    return s.replace(',', '').replace('$', '').strip()


def slurp_row(fh, rows):
    """Read lines from fh up to the next </tr> and append the row to rows.

    The parser is line oriented: each line is classified as a row end, a
    complete cell, the start of a multi-line cell, the end of a multi-line
    cell, or (while inside a cell) continuation text.  A line is only ever
    treated as one of these, so at most one cell is taken from a line.

    fh   -- object with a readline() method returning '' at end of input
    rows -- list that receives the row (a list of cleaned cell strings)

    Returns True if a row was terminated by </tr> and appended, or False if
    the end of input was reached first (a partially read row is discarded).
    """
    row = []
    contents = ''
    in_cell = False
    for line in iter(fh.readline, ''):
        if _ROW_END_RE.search(line):
            rows.append(row)
            return True

        m = _CELL_ONE_LINE_RE.search(line)
        if m is not None:
            row.append(filter_contents(m.group(1)))
            continue

        m = _CELL_OPEN_RE.search(line)
        if m is not None:
            # Cell continues on following lines; start accumulating text.
            in_cell = True
            contents = m.group(1)
            continue

        m = _CELL_CLOSE_RE.match(line)
        if m is not None:
            contents += m.group(1)
            row.append(filter_contents(contents))
            in_cell = False
            continue

        if in_cell:
            contents += line
    return False


def main(argv):
    """Convert the HTML report named by argv[1] to CSV on standard output.

    Only rows with exactly _EXPECTED_COLUMNS cells are emitted; all other
    rows are skipped.

    Returns -2 if no input file was given, otherwise 0.
    """
    if len(argv) < 2:
        sys.stderr.write("Error: specify input HTML file\n")
        return -2

    rows = []
    # The with-block guarantees the file is closed even if parsing raises.
    with open(argv[1], 'r') as fh:
        while slurp_row(fh, rows):
            pass

    for r in rows:
        if len(r) == _EXPECTED_COLUMNS:
            sys.stdout.write(','.join(r))
            sys.stdout.write('\n')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))