aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authornico <nico@magicbroccoli.de>2019-05-14 18:54:34 +0200
committernico <nico@magicbroccoli.de>2019-05-14 18:54:34 +0200
commit426ecef4a572b1f9feeb3c62d4bc01e726e77ae7 (patch)
tree54351782b1151b77c49028a32d9106fd3274a371
Initial working commit
+ python3 ingest and db management + abusereport bash script
-rwxr-xr-x.gitignore133
-rwxr-xr-xREADME.md38
-rwxr-xr-xmain.py126
-rwxr-xr-xrequirements.txt2
4 files changed, 299 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..d99bddd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,133 @@
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+.idea/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+### Python Patch ###
+.venv/
+
+### Python.VirtualEnv Stack ###
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+pip-selfcheck.json
+
+# do not index databases or logfiles
+*.log
+*.db
+abuse*
+spam*
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..10d92d9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,38 @@
+# ejabberd mod_spam_filter ingest
+
+## installation
+Python 3 virtual environment
+```bash
+virtualenv -p python3 venv && source venv/bin/activate
+pip install -r requirements.txt
+```
+
+## configuration
+### ejabberd
+`/etc/ejabberd/modules.d/mod_spam_filter.yml`
+```yaml
+modules:
+ mod_spam_filter:
+ ...
+ spam_dump_file: "/var/log/ejabberd/spam-example.de.txt"
+ ...
+```
+
+## usage main.py
+```
+usage: main.py [-h] [-in INFILE] [-d DOMAIN]
+
+optional arguments:
+ -h, --help show this help message and exit
+ -in INFILE, --infile INFILE
+ set path to input file
+ -d DOMAIN, --domain DOMAIN
+ specify report domain
+```
+
+The `--infile` argument supports only a single log file at a time.
+
+## usage abusereport-domain.sh
+```bash
+./abusereport-domain.sh domain.tld
+``` \ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..e86faf4
--- /dev/null
+++ b/main.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import argparse
+import re
+import sqlite3
+import subprocess
+
+from defusedxml import ElementTree
+
+
+class AbuseReport:
+    """Ingest ejabberd mod_spam_filter dumps into sqlite and report on the stored spam"""
+
+    def __init__(self, arguments):
+        """:param arguments: parsed command line arguments providing infile and domain"""
+        self.infile = arguments.infile
+        self.domain = arguments.domain
+        self.conn = sqlite3.connect('spam.db')
+
+        self.jid_pattern = re.compile("^(?:([^\"&'/:<>@]{1,1023})@)?([^/@]{1,1023})(?:/(.{1,1023}))?$")
+        self.message_pattern = re.compile(r'<message.*?</message>', re.DOTALL)
+
+    def main(self):
+        """method deciding over which action to take"""
+        try:
+            if self.infile is None:
+                # infile unset -> report top10
+                self.report()
+            elif self.infile:
+                # infile set -> ingest
+                self.ingest()
+        finally:
+            # close sqlite connection, also when report() or ingest() exit early
+            self.conn.close()
+
+    def report(self):
+        """
+        print the top10 score or domain specific data using the sqlite3 command line tool
+        """
+        columns = 'COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain'
+        if self.domain is not None:
+            # double every single quote so the value cannot leave the SQL string literal
+            literal = self.domain.replace("'", "''")
+            sql = "SELECT {} FROM spam WHERE domain='{}';".format(columns, literal)
+        else:
+            sql = 'SELECT {} AS domain FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;'.format(columns)
+
+        # argument list instead of a command string: no shell ever parses the domain
+        cmd = ['sqlite3', '-column', '-header', 'spam.db', sql]
+        try:
+            out = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                                 universal_newlines=True).stdout
+        except OSError as err:
+            # sqlite3 command line tool is missing or not executable
+            print(err)
+            raise SystemExit(1)
+        # drop the one trailing newline of the tool output, print() adds its own
+        print(out[:-1] if out.endswith('\n') else out)
+
+    def ingest(self):
+        """method to ingest xml messages into sqlite database, exits with 1 if infile is missing"""
+        try:
+            with open(self.infile, "r", encoding="utf-8") as spam:
+                log = self.message_pattern.findall(spam.read())
+        except FileNotFoundError as err:
+            print(err)
+            raise SystemExit(1)
+
+        self.db_import(log)
+
+    def db_import(self, message_log):
+        """
+        import xml stanzas into database, stanzas without sender, stamp or body are skipped
+        :param message_log: xml messages
+        """
+        self.conn.execute('''CREATE TABLE IF NOT EXISTS "spam" ("user" TEXT, "domain" TEXT, "ts" TEXT, "message" TEXT,
+                PRIMARY KEY("domain","ts"));''')
+        # OR IGNORE drops rows whose (domain, ts) key is already stored, e.g. on re-ingest
+        sql = 'INSERT OR IGNORE INTO spam("user", "domain", "ts", "message") VALUES(?, ?, ?, ?);'
+
+        try:
+            for message in message_log:
+                message_parsed = ElementTree.fromstring(message)
+                sender = self.jid_pattern.match(message_parsed.get('from') or '')
+                delay = message_parsed.find('.//{urn:xmpp:delay}delay')
+                stamp = None if delay is None else delay.get('stamp')
+                body = message_parsed.find('{jabber:client}body')
+                if sender is None or stamp is None or body is None:
+                    # incomplete stanza -> nothing usable to store
+                    continue
+
+                node, domain, _resource = sender.groups()
+                # bound parameters: quotes in a spam body cannot break the statement, None becomes NULL
+                self.conn.execute(sql, (node, domain, stamp, body.text))
+        finally:
+            # one commit per run, kept in finally so stored rows survive a later parse error
+            self.conn.commit()
+
+
+if __name__ == "__main__":
+    # command line interface: without an infile the script reports, with one it ingests
+    cli = argparse.ArgumentParser()
+    cli.add_argument('-in', '--infile', dest='infile', help='set path to input file')
+    cli.add_argument('-d', '--domain', dest='domain', help='specify report domain')
+
+    # hand the parsed namespace to the worker class and run it
+    AbuseReport(cli.parse_args()).main()
+
+"""
+# Top 10 Domains and their score
+SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS 'domain'
+FROM spam
+GROUP BY domain
+ORDER BY 1 DESC LIMIT 10;
+
+# Most frequent messages
+SELECT COUNT(*) as count, COUNT(distinct user||domain) as bots,message
+FROM spam
+GROUP BY message HAVING bots > 1
+ORDER BY 1 DESC LIMIT 5;
+
+# report sql
+SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain
+FROM spam
+WHERE domain='default.rs';
+"""
diff --git a/requirements.txt b/requirements.txt
new file mode 100755
index 0000000..20018c9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+defusedxml
+slixmpp \ No newline at end of file