From 426ecef4a572b1f9feeb3c62d4bc01e726e77ae7 Mon Sep 17 00:00:00 2001 From: nico Date: Tue, 14 May 2019 18:54:34 +0200 Subject: Initial working commit + python3 ingest and db management + abusereport bash script --- .gitignore | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 38 ++++++++++++++++ main.py | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 + 4 files changed, 299 insertions(+) create mode 100755 .gitignore create mode 100755 README.md create mode 100755 main.py create mode 100755 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..d99bddd --- /dev/null +++ b/.gitignore @@ -0,0 +1,133 @@ +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +.idea/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +### Python Patch ### +.venv/ + +### Python.VirtualEnv Stack ### +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + +# do not index databases or logfiles +*.log +*.db +abuse* +spam* diff --git a/README.md b/README.md new file mode 100755 index 0000000..10d92d9 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# ejabberd mod_spam_filter ingest + +## installation +Python 3 virtual environment +```bash +virtualenv -p python3 +pip install -r requirements.txt +``` + +## configuration +### ejabberd +`/etc/ejabberd/modules.d/mod_spam_filter.yml` +```yaml +modules: + mod_spam_filter: + ... + spam_dump_file: "/var/log/ejabberd/spam-example.de.txt" + ... 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Ingest ejabberd mod_spam_filter dumps into SQLite and report on them.

usage: main.py [-h] [-in INFILE] [-d DOMAIN]

optional arguments:
  -h, --help            show this help message and exit
  -in INFILE, --infile INFILE
                        set path to input file
  -d DOMAIN, --domain DOMAIN
                        specify report domain

With ``--infile`` the given dump is parsed and stored in ``spam.db``; only a
single log file is supported per run.  Without it a report is printed: the
ten domains with the most stored messages, or the totals of one domain when
``--domain`` is given.

Handy ad-hoc queries against ``spam.db``::

    -- Top 10 domains and their score
    SELECT COUNT(*) AS messages, COUNT(DISTINCT user) AS bots, domain
    FROM spam
    GROUP BY domain
    ORDER BY 1 DESC LIMIT 10;

    -- Most frequent messages
    SELECT COUNT(*) AS count, COUNT(DISTINCT user||domain) AS bots, message
    FROM spam
    GROUP BY message HAVING bots > 1
    ORDER BY 1 DESC LIMIT 5;

    -- Report for a single domain
    SELECT COUNT(*) AS messages, COUNT(DISTINCT user) AS bots, domain
    FROM spam
    WHERE domain = 'default.rs';
"""
import argparse
import re
import sqlite3
import sys


class AbuseReport:
    """Ingestion and reporting for ejabberd spam logs."""

    # SQLite database file, relative to the current working directory.
    database = 'spam.db'

    # Splits a JID into (node, domain, resource); node and resource are
    # optional and come back as None when absent.
    jid_pattern = re.compile("^(?:([^\"&'/:<>@]{1,1023})@)?([^/@]{1,1023})(?:/(.{1,1023}))?$")

    # One <message>...</message> stanza, possibly spanning several lines.
    # NOTE(review): the pattern literal was empty in the reviewed source (the
    # markup looks like it was stripped on export); an empty pattern only ever
    # yields empty matches, so it is reconstructed here -- confirm against a
    # real dump file.
    message_pattern = re.compile(r'<message.*?</message>', re.DOTALL)

    def __init__(self, arguments):
        """
        :param arguments: parsed command line namespace providing ``infile``
                          and ``domain`` (either may be None)
        """
        self.infile = arguments.infile
        self.domain = arguments.domain
        self.conn = sqlite3.connect(self.database)

    def main(self):
        """
        Decide which action to take: report when no input file was given,
        ingest otherwise.  The database connection is closed in every case,
        including when ingest terminates the program.
        """
        try:
            if self.infile is None:
                # infile unset -> report top10
                self.report()
            elif self.infile:
                # infile set -> ingest
                self.ingest()
        finally:
            self.conn.close()

    def report(self):
        """
        Print the number of messages and distinct senders per domain.

        With ``self.domain`` set only that domain is reported, otherwise the
        ten domains with the most messages.  The domain is passed as a bound
        parameter, never interpolated into SQL or a shell command.
        """
        columns = 'SELECT COUNT(*) AS messages, COUNT(DISTINCT user) AS bots, domain FROM spam '
        if self.domain is not None:
            sql = columns + 'WHERE domain = ?'
            params = (self.domain,)
        else:
            sql = columns + 'GROUP BY domain ORDER BY 1 DESC LIMIT 10'
            params = ()

        try:
            cursor = self.conn.execute(sql, params)
        except sqlite3.OperationalError as err:
            # typically "no such table: spam" when nothing was ingested yet
            print('Error: {}'.format(err), file=sys.stderr)
            return

        headers = [column[0] for column in cursor.description]
        print(self._format_table(headers, cursor.fetchall()))

    @staticmethod
    def _format_table(headers, rows):
        """
        Render rows as left-aligned text columns under an underlined header.

        :param headers: column titles
        :param rows: sequences of values; None is rendered as an empty cell
        :return: the table as a single string without trailing newline
        """
        cells = [['' if value is None else str(value) for value in row] for row in rows]
        widths = [
            max([len(title)] + [len(row[index]) for row in cells])
            for index, title in enumerate(headers)
        ]
        table = [list(headers), ['-' * width for width in widths]] + cells
        return '\n'.join(
            '  '.join(cell.ljust(width) for cell, width in zip(line, widths)).rstrip()
            for line in table
        )

    def ingest(self):
        """
        Read the input file, extract all message stanzas and import them.

        Exits with status 1 when the file cannot be read.
        """
        try:
            with open(self.infile, "r", encoding="utf-8") as spam:
                log = self.message_pattern.findall(spam.read())
        except OSError as err:
            print(err, file=sys.stderr)
            sys.exit(1)

        self.db_import(log)

    def db_import(self, message_log):
        """
        Import xml stanzas into the database.

        Stanzas that cannot be parsed, or that lack a valid sender, a delay
        stamp or a body element, are skipped with a note on stderr instead of
        aborting the whole import.  Rows whose (domain, ts) key already
        exists are ignored.

        :param message_log: iterable of xml message strings
        """
        # Imported here so that report mode works without the XML dependency.
        from defusedxml import ElementTree

        self.conn.execute('''CREATE TABLE IF NOT EXISTS "spam" ("user" TEXT, "domain" TEXT, "ts" TEXT, "message" TEXT,
        PRIMARY KEY("domain","ts"));''')

        # One transaction for the whole batch: committed when the block
        # completes, rolled back if an unexpected error escapes.
        with self.conn:
            for message in message_log:
                try:
                    stanza = ElementTree.fromstring(message)
                except (ElementTree.ParseError, ValueError) as err:
                    # ValueError covers defusedxml's "forbidden construct" errors
                    print('skipping unparsable stanza: {}'.format(err), file=sys.stderr)
                    continue

                record = self._extract_record(stanza)
                if record is None:
                    print('skipping incomplete stanza', file=sys.stderr)
                    continue

                self._store_message(*record)

    def _extract_record(self, stanza):
        """
        Pull the stored fields out of a parsed message stanza.

        :param stanza: parsed ``message`` element
        :return: tuple (node, domain, stamp, body text), or None when the
                 sender JID, the delay stamp or the body element is missing
        """
        spam_from = stanza.get('from')
        match = self.jid_pattern.match(spam_from) if spam_from else None
        delay = stanza.find('.//{urn:xmpp:delay}delay')
        body = stanza.find('{jabber:client}body')
        if match is None or delay is None or body is None:
            return None

        stamp = delay.get('stamp')
        if stamp is None:
            return None

        node, domain, _resource = match.groups()
        # node and body.text may be None; they are stored as NULL rather
        # than as the text "None"
        return node, domain, stamp, body.text

    def _store_message(self, node, domain, stamp, body):
        """
        Insert one row using bound parameters; duplicates of the
        (domain, ts) primary key are silently ignored.
        """
        self.conn.execute(
            'INSERT OR IGNORE INTO spam("user", "domain", "ts", "message") VALUES(?, ?, ?, ?);',
            (node, domain, stamp, body),
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-in', '--infile', help='set path to input file', dest='infile')
    parser.add_argument('-d', '--domain', help='specify report domain', dest='domain')
    args = parser.parse_args()

    # run
    AbuseReport(args).main()
--- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +defusedxml +slixmpp \ No newline at end of file -- cgit v1.2.3-18-g5258