From b5176fd5585262f11d1c5bc866b1cbe9d9b9fa9b Mon Sep 17 00:00:00 2001 From: nico Date: Thu, 23 May 2019 17:14:14 +0200 Subject: multiple file and gzip file support + add support gzip compressed logs + add support for multiple input files --- .gitignore | 3 ++- main.py | 47 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 8d32868..cce7ba1 100644 --- a/.gitignore +++ b/.gitignore @@ -128,4 +128,5 @@ pip-selfcheck.json # project specific files spam.db -spam-*.txt +config.json +spam-*.txt* diff --git a/main.py b/main.py index 014468b..250cebb 100755 --- a/main.py +++ b/main.py @@ -6,6 +6,8 @@ import sqlite3 import tabulate from defusedxml import ElementTree +import os +import gzip class AbuseReport: @@ -14,8 +16,9 @@ class AbuseReport: def __init__(self, arguments): self.infile = arguments.infile self.domain = arguments.domain - self.conn = sqlite3.connect('spam.db') + self.path = os.path.dirname(__file__) + self.conn = sqlite3.connect("".join([self.path, "/spam.db"])) self.jid_pattern = re.compile("^(?:([^\"&'/:<>@]{1,1023})@)?([^/@]{1,1023})(?:/(.{1,1023}))?$") self.message_pattern = re.compile(r'', re.DOTALL) @@ -65,16 +68,42 @@ class AbuseReport: def ingest(self): """ - method to ingest xml messages into sqlite database + ingest method to split up the ingest file list + if necessary decompression and decoding are applied """ - try: - with open(self.infile, "r", encoding="utf-8") as spam: - log = re.findall(self.message_pattern, spam.read()) + magic_number = b"\x1f\x8b\x08" + # split up list + for element in self.infile: + + try: + # open file in binary mode + with open(element, "rb") as infile: + content = infile.read() + + except FileNotFoundError as err: + print(err) + + # check file for gzip magic number + # if magic number is present decompress and decode file + if content.startswith(magic_number): + content = gzip.decompress(content).decode("utf-8") + # in any other case read file normally + else: + content = content.decode("utf-8") + + if content is not None: + self.parse(content) + + def parse(self, infile): + """ + method to parse xml messages + :param infile: string containing xml stanzas + """ + log = re.findall(self.message_pattern, infile) + + if log is not None: self.db_import(log) - except FileNotFoundError as err: - print(err) - exit(1) def db_import(self, message_log): """ @@ -117,7 +146,7 @@ class AbuseReport: if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-in', '--infile', help='set path to input file', dest='infile') + parser.add_argument('-in', '--infile', nargs='+', help='set path to input file', dest='infile') parser.add_argument('-d', '--domain', help='specify report domain', dest='domain') args = parser.parse_args() -- cgit v1.2.3-18-g5258