aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--README.md72
-rw-r--r--config.py23
-rwxr-xr-xmain.py225
-rw-r--r--report/.gitkeep0
-rw-r--r--template/abuse-template.txt28
6 files changed, 297 insertions, 53 deletions
diff --git a/.gitignore b/.gitignore
index cce7ba1..bc8cbd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,4 +129,4 @@ pip-selfcheck.json
# project specific files
spam.db
config.json
-spam-*.txt*
+/report/*.txt
diff --git a/README.md b/README.md
index 10d92d9..5fca75e 100644
--- a/README.md
+++ b/README.md
@@ -14,25 +14,83 @@ pip install -r requirements.txt
modules:
mod_spam_filter:
...
- spam_dump_file: "/var/log/ejabberd/spam-example.de.txt"
+ spam_dump_file: "/var/log/ejabberd/spam-@HOST@.txt"
...
```
+### config.json
+The `config.json` file is used to preserve data across possible updates to this script. `config.py` will load
+`config.json` to extract the name, which is used to sign the report message. In the future `config.json` may
+contain other settings as well.
+
+```json
+$ cat config.json
+{
+ "name": "username"
+}
+```
+
+
## usage main.py
```
-usage: main.py [-h] [-in INFILE] [-d DOMAIN]
+usage: main.py [-h] [-in INFILE [INFILE ...]] [-d DOMAIN] [-r]
optional arguments:
-h, --help show this help message and exit
- -in INFILE, --infile INFILE
+ -in INFILE [INFILE ...], --infile INFILE [INFILE ...]
set path to input file
-d DOMAIN, --domain DOMAIN
specify report domain
+ -r, --report toggle report output to file
+```
+
+#### run with no argument
+If `main.py` is run without any arguments attached, then the script will output a "top 10" table showing the number
+of messages/bots for the most spammy domains in the database.
+
+##### example
+```bash
+$./main.py
+
+| messages | bots | domain |
+|------------+--------+---------------|
+| 42 | 1 | example.net |
+| 17 | 9 | example.rs |
+| 7 | 5 | example.cd |
+| 5 | 3 | example.de |
+| 4 | 4 | example.ru |
+| 3 | 1 | example.co.uk |
+| 3 | 3 | example.com |
+| 3 | 1 | example.net |
+| 3 | 1 | example.fr |
+| 3 | 1 | example.com |
+```
+
+#### -in / --infile
+The `-in` or `--infile` argument is designed to be run automatically via the logrotate daemon. Therefore the script is
+able to process gzip-compressed files and also multiple files at once via shell expansion.
+
+##### example
+If ejabberd is configured to create multiple spamdump files it is possible to ingest all files at once, following
+this example.
+```bash
+$ ./main.py --in /var/log/ejabberd/spam-*.log
```
-The `--in` argument does only support a single log file at a time.
+#### -d / --domain
+If a domain is specifically defined to be processed, the script will only query the sqlite database for that domain.
+It is possible to provide multiple domains at once via multiple `-d` or `--domain` arguments.
-## usage abusereport-domain.sh
+##### example
```bash
-./abusereport-domain.sh domain.tld
-``` \ No newline at end of file
+$ ./main.py -d example.tld -d example.com
+
+| messages | bots | domain | first seen | last seen |
+|------------+--------+-------------+-----------------------------+-----------------------------|
+| 15 | 9 | example.tld | 2019-04-28T20:19:43.939926Z | 2019-05-22T13:59:53.339834Z |
+| 23 | 7 | example.com | 2018-02-28T20:19:43.939926Z | 2019-05-22T13:59:53.339834Z |
+```
+
+#### -r / --report
+This flag will only take effect if the `-d` or `--domain` argument is used. If that is the case, the script will
+automatically gather information about the specified domain and write it to the `report` directory.
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..fac5f9b
--- /dev/null
+++ b/config.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+import json
+import os
+
+# filepath of the config.json in the project directory
+path = os.path.dirname(__file__)
+filepath = ("/".join([path, "config.json"]))
+
+# try to read config.json; if it does not exist, create config.json and populate it
+try:
+ with open(filepath, "r", encoding="utf-8") as f:
+ config = json.load(f)
+
+except FileNotFoundError:
+ with open(filepath, "w", encoding="utf-8") as f:
+ config = {
+ "name": "",
+ }
+ f.write(json.dumps(config))
+
+
+class Config(object):
+ name = config["name"]
diff --git a/main.py b/main.py
index 250cebb..7090eb5 100755
--- a/main.py
+++ b/main.py
@@ -1,13 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
+import datetime as dt
+import gzip
+import os
import re
import sqlite3
+import dns.resolver as dns
import tabulate
from defusedxml import ElementTree
-import os
-import gzip
+
+from config import Config
class AbuseReport:
@@ -16,9 +20,10 @@ class AbuseReport:
def __init__(self, arguments):
self.infile = arguments.infile
self.domain = arguments.domain
+ self.report = arguments.report
self.path = os.path.dirname(__file__)
- self.conn = sqlite3.connect("".join([self.path, "/spam.db"]))
+ self.conn = sqlite3.connect("/".join([self.path, "spam.db"]))
self.jid_pattern = re.compile("^(?:([^\"&'/:<>@]{1,1023})@)?([^/@]{1,1023})(?:/(.{1,1023}))?$")
self.message_pattern = re.compile(r'<message.*?</message>', re.DOTALL)
@@ -26,10 +31,9 @@ class AbuseReport:
"""
method deciding over which action to take
"""
-
if self.infile is None:
# infile unset -> report top10
- self.report()
+ self.egest()
elif self.infile:
# infile set -> ingest
@@ -38,33 +42,47 @@ class AbuseReport:
# close sqlite connection
self.conn.close()
- def report(self):
+ def egest(self):
"""
- report method
- :return: top10 score or domain specific data
+ egest method
+ if specific domain is supplied return only those results
+ in any other case return top 10 table
"""
- # if a specific domain is supplied return only that set
+ result = list()
+
+ # if domain is specified return info for that domain
if self.domain is not None:
- # first and last time seen spam from specified domain
- first = self.conn.execute("SELECT ts FROM spam WHERE domain=:domain ORDER BY ts LIMIT 1",
- {"domain": self.domain}).fetchone()[0]
- last = self.conn.execute("SELECT ts FROM spam WHERE domain=:domain ORDER BY ts DESC LIMIT 1",
- {"domain": self.domain}).fetchone()[0]
+ result = list()
- print("First seen : {first}\nLast seen : {last}\n".format(first=first, last=last))
+ # iterate over all domains supplied
+ for domain in self.domain:
- result = self.conn.execute('SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain FROM spam '
- 'WHERE domain=\'{}\';'.format(self.domain))
- else:
+ query = self.conn.execute('''SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain,
+ MIN(ts) AS first,MAX(ts) AS last FROM spam WHERE domain = :domain;''',
+ {"domain": domain}).fetchall()
- result = self.conn.execute('SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS domain '
- 'FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;')
+ # if the specified domain is not listed yet, the resulting table would miss the domain name
+ # this ugly tuple-to-list swap prevents this behaviour
+ temp = list(query[0])
+ if temp[2] is None:
+ temp[2] = domain
+ query[0] = tuple(temp)
- # format data as table
- table = tabulate.tabulate(result, headers=["messages", "bots", "domain"], tablefmt="orgtbl")
- print(table)
+ # extend result table
+ result.extend(query)
+ # generate report if enabled
+ if self.report:
+ self.gen_report(domain, query)
+ else:
+ # in any other case return top 10
+ result = self.conn.execute('''SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS domain
+ FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;''')
+ # format data as table
+ table = tabulate.tabulate(result, headers=["messages", "bots", "domain","first seen", "last seen"],
+ tablefmt="orgtbl")
+ print(table)
def ingest(self):
"""
@@ -73,7 +91,7 @@ class AbuseReport:
"""
magic_number = b"\x1f\x8b\x08"
- # split up list
+ # iterate over all infile elements
for element in self.infile:
try:
@@ -82,9 +100,9 @@ class AbuseReport:
content = infile.read()
except FileNotFoundError as err:
+ content = ""
print(err)
- # check file for gzip magic number
# if magic number is present decompress and decode file
if content.startswith(magic_number):
content = gzip.decompress(content).decode("utf-8")
@@ -92,12 +110,14 @@ class AbuseReport:
else:
content = content.decode("utf-8")
+ # automated run None catch
if content is not None:
self.parse(content)
def parse(self, infile):
"""
method to parse xml messages
+ :type infile: str
:param infile: string containing xml stanzas
"""
log = re.findall(self.message_pattern, infile)
@@ -108,6 +128,7 @@ class AbuseReport:
def db_import(self, message_log):
"""
import xml stanzas into database
+ :type infile: str
:param message_log: xml messages
"""
self.conn.execute('''CREATE TABLE IF NOT EXISTS "spam" ("user" TEXT, "domain" TEXT, "ts" TEXT, "message" TEXT,
@@ -143,31 +164,145 @@ class AbuseReport:
finally:
self.conn.commit()
+ def gen_report(self, domain, query):
+ """
+ method generating the report files
+ :type domain: str
+ :param domain: string containing a domain name
+ :param query: list of result rows (as returned by fetchall) for the specified domain
+ """
+ try:
+ # open abuse report template file
+ with open("/".join([self.path, "template/abuse-template.txt"]), "r", encoding="utf-8") as template:
+ report_template = template.read()
+
+ except FileNotFoundError as err:
+ print(err)
+ exit(1)
+
+ # current date
+ now = dt.datetime.strftime(dt.datetime.now(), "%Y-%m-%d")
+
+ # output to report directory
+ report_filename = "abuse-{domain}-{date}.txt".format(date=now, domain=domain)
+ jids_filename = "abuse-{domain}-{date}-jids.txt".format(date=now, domain=domain)
+ logs_filename = "abuse-{domain}-{date}-logs.txt".format(date=now, domain=domain)
+
+ # write report files
+ with open("/".join([self.path, "report", report_filename]), "w", encoding="utf-8") as report_out:
+ content = self.report_template(report_template, domain, query)
+ report_out.write(content)
+
+ with open("/".join([self.path, "report", jids_filename]), "w", encoding="utf-8") as report_out:
+ content = self.report_jids(domain)
+ report_out.write(content)
+
+ with open("/".join([self.path, "report", logs_filename]), "w", encoding="utf-8") as report_out:
+ content = self.report_logs(domain)
+ report_out.write(content)
+
+ def report_template(self, template, domain, query):
+ """
+ method to collect and format the template file to the final abuse report
+ :type template: str
+ :type domain: str
+ :param template: string containing the abuse report template
+ :param domain: string containing a domain name
+ :param query: list of result rows (as returned by fetchall) for the specified domain
+ :return: string containing the fully formatted abuse report
+ """
+ name = Config.name
+
+ # lookup srv and domain info
+ info = self.srvlookup(domain)
+ srv = info[0]["host"]
+ ips = "".join(info[0]["ip"])
+ summary = tabulate.tabulate(query, headers=["messages", "bots", "domain","first seen", "last seen"],
+ tablefmt="orgtbl")
+
+ report_out= template.format(name=name, domain=domain, srv=srv, ips=ips, summary=summary)
+
+ return report_out
+
+ def report_jids(self, domain):
+ """
+ method to collect all involved jids from the database
+ :type domain: str
+ :param domain: string containing a domain name
+ :return: formatted string containing the result
+ """
+
+ jids = self.conn.execute('''SELECT user || '@' || domain as jid FROM spam WHERE domain=:domain GROUP BY user
+ ORDER BY 1;''', {"domain": domain}).fetchall()
+
+ return tabulate.tabulate(jids, tablefmt="plain")
+
+ def report_logs(self, domain):
+ """
+ method to collect all messages grouped by frequency
+ :type domain: str
+ :param domain: string containing a domain name
+ :return: formatted string containing the result
+ """
+ logs = self.conn.execute('''SELECT char(10)||MIN(ts)||' - '||MAX(ts)||char(10)||COUNT(*)||' messages:'||char(10)
+ ||'========================================================================'||char(10)||message||char(10)||
+ '========================================================================' FROM spam WHERE domain=:domain
+ GROUP BY message ORDER BY COUNT(*) DESC LIMIT 10;''', {"domain": domain}).fetchall()
+
+ return tabulate.tabulate(logs, tablefmt="plain")
+
+ def srvlookup(self, domain):
+ """
+ srv lookup method for the domain provided, if no srv record is found the base domain is used
+ :type domain: str
+ :param domain: provided domain to query srv records for
+ :return: sorted list of dictionaries containing host and ip info
+ """
+ # srv
+ query = '_xmpp-client._tcp.{}'.format(domain)
+
+ try:
+ srv_records = dns.query(query, 'SRV')
+
+ except (dns.NXDOMAIN, dns.NoAnswer):
+ # catch NXDOMAIN and NoAnswer tracebacks
+ srv_records = None
+
+ # extract record
+ results = list()
+
+ if srv_records is not None:
+ # extract all available records
+ for record in srv_records:
+ info = dict()
+
+ # gather necessary info from srv records
+ info["host"] = str(record.target).rstrip('.')
+ info["weight"] = record.weight
+ info["priority"] = record.priority
+ info["ip"] = [ip.address for ip in dns.query(info["host"], "A")]
+ results.append(info)
+
+ # return list sorted by priority and weight
+ return sorted(results, key=lambda i: (i['priority'], i["weight"]))
+
+ # prevent empty info when srv records are not present
+ info = dict()
+
+ # no srv record available: fall back to the provided domain itself as host
+ info["host"] = domain
+ info["ip"] = [ip.address for ip in dns.query(info["host"], "A")]
+ results.append(info)
+
+ return results
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-in', '--infile', nargs='+', help='set path to input file', dest='infile')
- parser.add_argument('-d', '--domain', help='specify report domain', dest='domain')
+ parser.add_argument('-d', '--domain', action='append', help='specify report domain', dest='domain')
+ parser.add_argument('-r', '--report', action='store_true', help='toggle report output to file', dest='report')
args = parser.parse_args()
# run
AbuseReport(args).main()
-
-"""
-# Top 10 Domains and their score
-SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS 'domain'
-FROM spam
-GROUP BY domain
-ORDER BY 1 DESC LIMIT 10;
-
-# Most frequent messages
-SELECT COUNT(*) as count, COUNT(distinct user||domain) as bots,message
-FROM spam
-GROUP BY message HAVING bots > 1
-ORDER BY 1 DESC LIMIT 5;
-
-# report sql
-SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain
-FROM spam
-WHERE domain="default.rs";
-"""
diff --git a/report/.gitkeep b/report/.gitkeep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/report/.gitkeep
diff --git a/template/abuse-template.txt b/template/abuse-template.txt
new file mode 100644
index 0000000..fe729bb
--- /dev/null
+++ b/template/abuse-template.txt
@@ -0,0 +1,28 @@
+Subject: XMPP spam report for {domain} / {ips}
+
+XMPP domain: {domain}
+Server: {srv}
+Jabber IP: {ips}
+
+Hi,
+
+the above mentioned server is used as an open relay to send vast amounts
+of XMPP spam to different unrelated servers, such as the server I
+administer.
+
+Spammers are using the In-Band-Registration feature on that server to
+create a large number of accounts, and to send mass messages to my
+users.
+
+Please contact the server owner to disable In-Band-Registration, to take
+measures against spam relaying or to shut down the XMPP service.
+
+Also please find attached a list of the bot accounts and an excerpt of
+the spam messages sent to my service.
+
+{summary}
+
+
+Kind regards,
+
+{name}