From 8e7ac358a4d0f2149873304361c73870d06f8e18 Mon Sep 17 00:00:00 2001 From: nico Date: Wed, 29 May 2019 23:04:22 +0200 Subject: initial release report feature Misc + add report directory + add template directory Feature Release + add report feature --report + add basic report template + add feature to use -d/ --domain flag multiple times Optimization + add config.py loading user config * further code optimization * update gitignore file --- .gitignore | 2 +- config.py | 19 +++++ main.py | 196 ++++++++++++++++++++++++++++++++++---------- report/.gitkeep | 0 template/abuse-template.txt | 28 +++++++ 5 files changed, 201 insertions(+), 44 deletions(-) create mode 100644 config.py create mode 100644 report/.gitkeep create mode 100644 template/abuse-template.txt diff --git a/.gitignore b/.gitignore index cce7ba1..bc8cbd5 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,4 @@ pip-selfcheck.json # project specific files spam.db config.json -spam-*.txt* +/report/*.txt diff --git a/config.py b/config.py new file mode 100644 index 0000000..12a41a4 --- /dev/null +++ b/config.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +import json + +# try to read config.json if nonexistent create config.json an populate it +try: + with open("config.json", "r", encoding="utf-8") as f: + config = json.load(f) + +except FileNotFoundError: + with open("config.json", "w", encoding="utf-8") as f: + config = { + "name": "", + } + f.write(json.dumps(config)) + + +class Config(object): + """extract secret key to use for the webserver""" + name = config["name"] diff --git a/main.py b/main.py index 250cebb..d6ed2d3 100755 --- a/main.py +++ b/main.py @@ -1,13 +1,17 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- import argparse +import datetime as dt +import gzip +import os import re import sqlite3 +import dns.resolver as dns import tabulate from defusedxml import ElementTree -import os -import gzip + +from config import Config class AbuseReport: @@ -16,9 +20,10 @@ class AbuseReport: def __init__(self, arguments): self.infile = arguments.infile self.domain = arguments.domain + self.report = arguments.report self.path = os.path.dirname(__file__) - self.conn = sqlite3.connect("".join([self.path, "/spam.db"])) + self.conn = sqlite3.connect("/".join([self.path, "spam.db"])) self.jid_pattern = re.compile("^(?:([^\"&'/:<>@]{1,1023})@)?([^/@]{1,1023})(?:/(.{1,1023}))?$") self.message_pattern = re.compile(r'', re.DOTALL) @@ -26,10 +31,9 @@ class AbuseReport: """ method deciding over which action to take """ - if self.infile is None: # infile unset -> report top10 - self.report() + self.egest() elif self.infile: # infile set -> ingest @@ -38,33 +42,45 @@ class AbuseReport: # close sqlite connection self.conn.close() - def report(self): + def egest(self): """ report method :return: top10 score or domain specific data """ - # if a specific domain is supplied return only that set + result = list() + + # if domain is specified return info for that domain if self.domain is not None: - # first and last time seen spam from specified domain - first = self.conn.execute("SELECT ts FROM spam WHERE domain=:domain ORDER BY ts LIMIT 1", - {"domain": self.domain}).fetchone()[0] - last = self.conn.execute("SELECT ts FROM spam WHERE domain=:domain ORDER BY ts DESC LIMIT 1", - {"domain": self.domain}).fetchone()[0] + result = list() - print("First seen : {first}\nLast seen : {last}\n".format(first=first, last=last)) + # iterate over all domains supplied + for domain in self.domain: - result = self.conn.execute('SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain FROM spam ' - 'WHERE domain=\'{}\';'.format(self.domain)) - else: + query = self.conn.execute('''SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain, + MIN(ts) AS first,MAX(ts) AS last FROM spam WHERE domain = :domain;''', + {"domain": domain}).fetchall() - result = self.conn.execute('SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS domain ' - 'FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;') + # ugly tuple list swapping for nicer formatting + temp = list(query[0]) + if temp[2] is None: + temp[2] = domain + query[0] = tuple(temp) - # format data as table - table = tabulate.tabulate(result, headers=["messages", "bots", "domain"], tablefmt="orgtbl") - print(table) + # extend result table + result.extend(query) + # generate report if enabled + if self.report: + self.gen_report(domain, query) + else: + # in any other case return top 10 + result = self.conn.execute('''SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS domain + FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;''') + # format data as table + table = tabulate.tabulate(result, headers=["messages", "bots", "domain","first seen", "last seen"], + tablefmt="orgtbl") + print(table) def ingest(self): """ @@ -73,7 +89,7 @@ class AbuseReport: """ magic_number = b"\x1f\x8b\x08" - # split up list + # iterate over all infile elements for element in self.infile: try: @@ -82,9 +98,9 @@ class AbuseReport: content = infile.read() except FileNotFoundError as err: + content = "" print(err) - # check file for gzip magic number # if magic number is present decompress and decode file if content.startswith(magic_number): content = gzip.decompress(content).decode("utf-8") @@ -92,6 +108,7 @@ class AbuseReport: else: content = content.decode("utf-8") + # automated run None catch if content is not None: self.parse(content) @@ -143,31 +160,124 @@ class AbuseReport: finally: self.conn.commit() + def gen_report(self, domain, query): + try: + # open abuse report template file + with open("/".join([self.path, "template/abuse-template.txt"]), "r", encoding="utf-8") as template: + report_template = template.read() + + except FileNotFoundError as err: + print(err) + exit(1) + + # current date + now = dt.datetime.strftime(dt.datetime.now(), "%Y-%m-%d") + + # output to report directory + report_filename = "abuse-{domain}-{date}.txt".format(date=now, domain=domain) + jids_filename = "abuse-{domain}-{date}-jids.txt".format(date=now, domain=domain) + logs_filename = "abuse-{domain}-{date}-logs.txt".format(date=now, domain=domain) + + # write report files + with open("/".join([self.path, "report", report_filename]), "w", encoding="utf-8") as report_out: + content = self.report_template(report_template, domain, query) + report_out.write(content) + + with open("/".join([self.path, "report", jids_filename]), "w", encoding="utf-8") as report_out: + content = self.report_jids(domain) + report_out.write(content) + + with open("/".join([self.path, "report", logs_filename]), "w", encoding="utf-8") as report_out: + content = self.report_logs(domain) + report_out.write(content) + + def report_template(self, template, domain, query): + name = Config.name + + # lookup srv and domain info + info = self.srvlookup(domain) + srv = info[0]["host"] + ips = "".join(info[0]["ip"]) + summary = tabulate.tabulate(query, headers=["messages", "bots", "domain","first seen", "last seen"], + tablefmt="orgtbl") + + report_out= template.format(name=name, domain=domain, srv=srv, ips=ips, summary=summary) + + return report_out + + def report_jids(self, domain): + + jids = self.conn.execute('''SELECT user || '@' || domain as jid FROM spam WHERE domain=:domain GROUP BY user + ORDER BY 1;''', {"domain": domain}).fetchall() + + return tabulate.tabulate(jids, tablefmt="plain") + + def report_logs(self, domain): + """ + + :param domain: + :return: + """ + logs = self.conn.execute('''SELECT char(10)||MIN(ts)||' - '||MAX(ts)||char(10)||COUNT(*)||' messages:'||char(10) + ||'========================================================================'||char(10)||message||char(10)|| + '========================================================================' FROM spam WHERE domain=:domain + GROUP BY message ORDER BY COUNT(*) DESC LIMIT 10;''', {"domain": domain}).fetchall() + + return tabulate.tabulate(logs, tablefmt="plain") + + def srvlookup(self, domain): + """ + srv lookup method for the domain provided, if no srv record is found the base domain is used + :type domain: str + :param domain: provided domain to query srv records for + :return: sorted list of dictionaries containing host and ip info + """ + # srv + query = '_xmpp-client._tcp.{}'.format(domain) + + try: + srv_records = dns.query(query, 'SRV') + + except (dns.NXDOMAIN, dns.NoAnswer): + # catch NXDOMAIN and NoAnswer tracebacks + srv_records = None + + # extract record + results = list() + + if srv_records is not None: + # extract all available records + for record in srv_records: + info = dict() + + # gather necessary info from srv records + info["host"] = str(record.target).rstrip('.') + info["weight"] = record.weight + info["priority"] = record.priority + info["ip"] = [ip.address for ip in dns.query(info["host"], "A")] + results.append(info) + + # return list sorted by priority and weight + return sorted(results, key=lambda i: (i['priority'], i["weight"])) + + else: + # prevent empty info when srv records are not present + info = dict() + + # gather necessary info from srv records + info["host"] = domain + info["ip"] = [ip.address for ip in dns.query(info["host"], "A")] + results.append(info) + + return results + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-in', '--infile', nargs='+', help='set path to input file', dest='infile') - parser.add_argument('-d', '--domain', help='specify report domain', dest='domain') + parser.add_argument('-d', '--domain', action='append', help='specify report domain', dest='domain') + parser.add_argument('-r', '--report', action='store_true', help='toggle report output to file', dest='report') args = parser.parse_args() # run AbuseReport(args).main() - -""" -# Top 10 Domains and their score -SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS 'domain' -FROM spam -GROUP BY domain -ORDER BY 1 DESC LIMIT 10; - -# Most frequent messages -SELECT COUNT(*) as count, COUNT(distinct user||domain) as bots,message -FROM spam -GROUP BY message HAVING bots > 1 -ORDER BY 1 DESC LIMIT 5; - -# report sql -SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain -FROM spam -WHERE domain="default.rs"; -""" diff --git a/report/.gitkeep b/report/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/template/abuse-template.txt b/template/abuse-template.txt new file mode 100644 index 0000000..fe729bb --- /dev/null +++ b/template/abuse-template.txt @@ -0,0 +1,28 @@ +Subject: XMPP spam report for {domain} / {ips} + +XMPP domain: {domain} +Server: {srv} +Jabber IP: {ips} + +Hi, + +the above mentioned server is used as an open relay to send vast amounts +of XMPP spam to different unrelated servers, such as the server I +administer. + +Spammers are using the In-Band-Registration feature on that server to +create a large number of accounts, and to send mass messages to my +users. + +Please contact the server owner to disable In-Band-Registration, to take +measures against spam relaying or to shut down the XMPP service. + +Also please find attached a list of the bot accounts and an excerpt of +the spam messages sent to my service. + +{summary} + + +Kind regards, + +{name} -- cgit v1.2.3-54-g00ecf From 6b2bf0719324856a0bbacb9b47dea88a7b9199d1 Mon Sep 17 00:00:00 2001 From: nico Date: Thu, 30 May 2019 02:18:45 +0200 Subject: readme update * update README.md to the latest changes --- README.md | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 10d92d9..b11279a 100644 --- a/README.md +++ b/README.md @@ -14,25 +14,48 @@ pip install -r requirements.txt modules: mod_spam_filter: ... - spam_dump_file: "/var/log/ejabberd/spam-example.de.txt" + spam_dump_file: "/var/log/ejabberd/spam-@HOST@.txt" ... ``` ## usage main.py ``` -usage: main.py [-h] [-in INFILE] [-d DOMAIN] +usage: main.py [-h] [-in INFILE [INFILE ...]] [-d DOMAIN] [-r] optional arguments: -h, --help show this help message and exit - -in INFILE, --infile INFILE + -in INFILE [INFILE ...], --infile INFILE [INFILE ...] set path to input file -d DOMAIN, --domain DOMAIN specify report domain + -r, --report toggle report output to file ``` -The `--in` argument does only support a single log file at a time. +#### -in / --infile +The `--in` or `--infile` argument is designed to run automatically via the logrotate daemon. Therefor the script is +able to process gzip compressed files and also multiple files at once via shell expansion. -## usage abusereport-domain.sh +##### example +If ejabberd is configured to create multiple spamdump files it is possible to ingest all files at once, following +this example. ```bash -./abusereport-domain.sh domain.tld -``` \ No newline at end of file +$ ./main.py --in /var/log/ejabberd/spam-*.log +``` + +#### -d / --domain +If a domain is specifically defined to be processed, the script will only query the sqlite database for that domain. +It is possible to provide multiple domains at once via multiple `-d` or `--domain` arguments. + +##### example +```bash +$ ./main.py --d example.tld -d example.com + +| messages | bots | domain | first seen | last seen | +|------------+--------+-------------+-----------------------------+-----------------------------| +| 15 | 9 | example.tld | 2019-04-28T20:19:43.939926Z | 2019-05-22T13:59:53.339834Z | +| 23 | 7 | example.com | 2018-02-28T20:19:43.939926Z | 2019-05-22T13:59:53.339834Z | +``` + +#### -r / --report +This flag will only take effect if the `-d` or `--domain` argument is used. If that is the case, the script will +automatically gather information about the specified domain and write them to the `report` directory. -- cgit v1.2.3-54-g00ecf From 0d9f4147b08be59e2f6dc98efce24942985a590b Mon Sep 17 00:00:00 2001 From: nico Date: Thu, 30 May 2019 02:35:23 +0200 Subject: unnecessary else * remove unnecessary else clause --- main.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/main.py b/main.py index d6ed2d3..efb3701 100755 --- a/main.py +++ b/main.py @@ -260,16 +260,15 @@ class AbuseReport: # return list sorted by priority and weight return sorted(results, key=lambda i: (i['priority'], i["weight"])) - else: - # prevent empty info when srv records are not present - info = dict() + # prevent empty info when srv records are not present + info = dict() - # gather necessary info from srv records - info["host"] = domain - info["ip"] = [ip.address for ip in dns.query(info["host"], "A")] - results.append(info) + # gather necessary info from srv records + info["host"] = domain + info["ip"] = [ip.address for ip in dns.query(info["host"], "A")] + results.append(info) - return results + return results if __name__ == "__main__": -- cgit v1.2.3-54-g00ecf From 911450c72e83985a8ff8a7f1301184515f4aa351 Mon Sep 17 00:00:00 2001 From: nico Date: Thu, 30 May 2019 02:49:38 +0200 Subject: readme config.py addition * update README to explain config.py --- README.md | 13 +++++++++++++ config.py | 1 - 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b11279a..97c056f 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,19 @@ modules: ... ``` +### config.json +The `config.json` file is used to preserve date from possible updates to this script. `config.py` will load `config +.json` to extract the name, which is used to sign the report message with. In the future there might be other things +the `config.json` may contain. + +```json +$ cat config.json +{ + "name": "username" +} +``` + + ## usage main.py ``` usage: main.py [-h] [-in INFILE [INFILE ...]] [-d DOMAIN] [-r] diff --git a/config.py b/config.py index 12a41a4..ff6f9fa 100644 --- a/config.py +++ b/config.py @@ -15,5 +15,4 @@ except FileNotFoundError: class Config(object): - """extract secret key to use for the webserver""" name = config["name"] -- cgit v1.2.3-54-g00ecf From f0940cc6152faec3695301c1f70430fddc38a898 Mon Sep 17 00:00:00 2001 From: nico Date: Fri, 31 May 2019 15:31:21 +0200 Subject: further documentation * add more docstrings to main.py * add README entry for no argument Thanks to @weiss --- README.md | 24 +++++++++++++++++++++++- main.py | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 97c056f..5fca75e 100644 --- a/README.md +++ b/README.md @@ -44,8 +44,30 @@ optional arguments: -r, --report toggle report output to file ``` +#### run with no argument +If `main.py` is run without any arguments attached, then the script will output a "top 10" table showing the amount +of messages/ bots for the most spammy domains in the database. + +##### example +```bash +$./main.py + +| messages | bots | domain | +|------------+--------+---------------| +| 42 | 1 | example.net | +| 17 | 9 | example.rs | +| 7 | 5 | example.cd | +| 5 | 3 | example.de | +| 4 | 4 | example.ru | +| 3 | 1 | example.co.uk | +| 3 | 3 | example.com | +| 3 | 1 | example.net | +| 3 | 1 | example.fr | +| 3 | 1 | example.com | +``` + #### -in / --infile -The `--in` or `--infile` argument is designed to run automatically via the logrotate daemon. Therefor the script is +The `--in` or `--infile` argument is designed to run automatically via the logrotate daemon. Therefore the script is able to process gzip compressed files and also multiple files at once via shell expansion. ##### example diff --git a/main.py b/main.py index efb3701..7090eb5 100755 --- a/main.py +++ b/main.py @@ -44,8 +44,9 @@ class AbuseReport: def egest(self): """ - report method - :return: top10 score or domain specific data + egest method + if specific domain is supplied return only those results + in any other case return top 10 table """ result = list() @@ -60,7 +61,8 @@ class AbuseReport: MIN(ts) AS first,MAX(ts) AS last FROM spam WHERE domain = :domain;''', {"domain": domain}).fetchall() - # ugly tuple list swapping for nicer formatting + # if specified domain is not listed yet, the resulting table would miss the domain name + # this ugle tuple 2 list swap prevents this behaviour temp = list(query[0]) if temp[2] is None: temp[2] = domain @@ -115,6 +117,7 @@ class AbuseReport: def parse(self, infile): """ method to parse xml messages + :type infile: str :param infile: string containing xml stanzas """ log = re.findall(self.message_pattern, infile) @@ -125,6 +128,7 @@ class AbuseReport: def db_import(self, message_log): """ import xml stanzas into database + :type infile: str :param message_log: xml messages """ self.conn.execute('''CREATE TABLE IF NOT EXISTS "spam" ("user" TEXT, "domain" TEXT, "ts" TEXT, "message" TEXT, @@ -161,6 +165,12 @@ class AbuseReport: self.conn.commit() def gen_report(self, domain, query): + """ + method generating the report files + :type domain: str + :param domain: string containing a domain name + :param query: sqlite cursor object containing the query results for the specified domain + """ try: # open abuse report template file with open("/".join([self.path, "template/abuse-template.txt"]), "r", encoding="utf-8") as template: @@ -192,6 +202,15 @@ class AbuseReport: report_out.write(content) def report_template(self, template, domain, query): + """ + method to collect and format the template file to the final abuse report + :type template: str + :type domain: str + :param template: string containing the abuse report template + :param domain: string containing a domain name + :param query: sqlite cursor object containing the query results for the specified domain + :return: string containing the fully formatted abuse report + """ name = Config.name # lookup srv and domain info @@ -206,6 +225,12 @@ class AbuseReport: return report_out def report_jids(self, domain): + """ + method to collect all involved jids from the database + :type domain: str + :param domain: string containing a domain name + :return: formatted string containing the result + """ jids = self.conn.execute('''SELECT user || '@' || domain as jid FROM spam WHERE domain=:domain GROUP BY user ORDER BY 1;''', {"domain": domain}).fetchall() @@ -214,9 +239,10 @@ class AbuseReport: def report_logs(self, domain): """ - - :param domain: - :return: + method to collect all messages grouped by frequency + :type domain: str + :param domain: string containing a domain name + :return: formatted string containing the result """ logs = self.conn.execute('''SELECT char(10)||MIN(ts)||' - '||MAX(ts)||char(10)||COUNT(*)||' messages:'||char(10) ||'========================================================================'||char(10)||message||char(10)|| -- cgit v1.2.3-54-g00ecf From 0d7e2f0c7cef6b7a853107bf37d44816244e7749 Mon Sep 17 00:00:00 2001 From: nico Date: Sun, 2 Jun 2019 15:30:54 +0200 Subject: file system clutter + add correct path to config.json to prevent file system clutter --- config.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/config.py b/config.py index ff6f9fa..fac5f9b 100644 --- a/config.py +++ b/config.py @@ -1,13 +1,18 @@ # -*- coding: utf-8 -*- import json +import os + +# filepath of the config.json in the project directory +path = os.path.dirname(__file__) +filepath = ("/".join([path, "config.json"])) # try to read config.json if nonexistent create config.json an populate it try: - with open("config.json", "r", encoding="utf-8") as f: + with open(filepath, "r", encoding="utf-8") as f: config = json.load(f) except FileNotFoundError: - with open("config.json", "w", encoding="utf-8") as f: + with open(filepath, "w", encoding="utf-8") as f: config = { "name": "", } -- cgit v1.2.3-54-g00ecf