diff options
author | nico <nico@magicbroccoli.de> | 2019-06-10 12:49:32 +0200 |
---|---|---|
committer | nico <nico@magicbroccoli.de> | 2019-06-10 12:56:01 +0200 |
commit | 243d48b92d82d15115b8d341649b405a11603c14 (patch) | |
tree | f131604385c6dd5a065d6c5e0ba1ac5c18554cee | |
parent | b5176fd5585262f11d1c5bc866b1cbe9d9b9fa9b (diff) | |
parent | 0d7e2f0c7cef6b7a853107bf37d44816244e7749 (diff) |
Merge branch 'report' 0.1.0
Misc
+ add report directory
+ add template directory
+ add config.py loading user config
Feature Release
+ add report feature --report
+ add basic report template
+ add feature to use -d/ --domain flag multiple times
Optimization
* further code optimization
* update gitignore file
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | README.md | 72 | ||||
-rw-r--r-- | config.py | 23 | ||||
-rwxr-xr-x | main.py | 225 | ||||
-rw-r--r-- | report/.gitkeep | 0 | ||||
-rw-r--r-- | template/abuse-template.txt | 28 |
6 files changed, 297 insertions, 53 deletions
@@ -129,4 +129,4 @@ pip-selfcheck.json # project specific files spam.db config.json -spam-*.txt* +/report/*.txt @@ -14,25 +14,83 @@ pip install -r requirements.txt modules: mod_spam_filter: ... - spam_dump_file: "/var/log/ejabberd/spam-example.de.txt" + spam_dump_file: "/var/log/ejabberd/spam-@HOST@.txt" ... ``` +### config.json +The `config.json` file is used to preserve date from possible updates to this script. `config.py` will load `config +.json` to extract the name, which is used to sign the report message with. In the future there might be other things +the `config.json` may contain. + +```json +$ cat config.json +{ + "name": "username" +} +``` + + ## usage main.py ``` -usage: main.py [-h] [-in INFILE] [-d DOMAIN] +usage: main.py [-h] [-in INFILE [INFILE ...]] [-d DOMAIN] [-r] optional arguments: -h, --help show this help message and exit - -in INFILE, --infile INFILE + -in INFILE [INFILE ...], --infile INFILE [INFILE ...] set path to input file -d DOMAIN, --domain DOMAIN specify report domain + -r, --report toggle report output to file +``` + +#### run with no argument +If `main.py` is run without any arguments attached, then the script will output a "top 10" table showing the amount +of messages/ bots for the most spammy domains in the database. + +##### example +```bash +$./main.py + +| messages | bots | domain | +|------------+--------+---------------| +| 42 | 1 | example.net | +| 17 | 9 | example.rs | +| 7 | 5 | example.cd | +| 5 | 3 | example.de | +| 4 | 4 | example.ru | +| 3 | 1 | example.co.uk | +| 3 | 3 | example.com | +| 3 | 1 | example.net | +| 3 | 1 | example.fr | +| 3 | 1 | example.com | +``` + +#### -in / --infile +The `--in` or `--infile` argument is designed to run automatically via the logrotate daemon. Therefore the script is +able to process gzip compressed files and also multiple files at once via shell expansion. 
+ +##### example +If ejabberd is configured to create multiple spamdump files it is possible to ingest all files at once, following +this example. +```bash +$ ./main.py --in /var/log/ejabberd/spam-*.log ``` -The `--in` argument does only support a single log file at a time. +#### -d / --domain +If a domain is specifically defined to be processed, the script will only query the sqlite database for that domain. +It is possible to provide multiple domains at once via multiple `-d` or `--domain` arguments. -## usage abusereport-domain.sh +##### example ```bash -./abusereport-domain.sh domain.tld -```
\ No newline at end of file +$ ./main.py --d example.tld -d example.com + +| messages | bots | domain | first seen | last seen | +|------------+--------+-------------+-----------------------------+-----------------------------| +| 15 | 9 | example.tld | 2019-04-28T20:19:43.939926Z | 2019-05-22T13:59:53.339834Z | +| 23 | 7 | example.com | 2018-02-28T20:19:43.939926Z | 2019-05-22T13:59:53.339834Z | +``` + +#### -r / --report +This flag will only take effect if the `-d` or `--domain` argument is used. If that is the case, the script will +automatically gather information about the specified domain and write them to the `report` directory. diff --git a/config.py b/config.py new file mode 100644 index 0000000..fac5f9b --- /dev/null +++ b/config.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +import json +import os + +# filepath of the config.json in the project directory +path = os.path.dirname(__file__) +filepath = ("/".join([path, "config.json"])) + +# try to read config.json if nonexistent create config.json an populate it +try: + with open(filepath, "r", encoding="utf-8") as f: + config = json.load(f) + +except FileNotFoundError: + with open(filepath, "w", encoding="utf-8") as f: + config = { + "name": "", + } + f.write(json.dumps(config)) + + +class Config(object): + name = config["name"] @@ -1,13 +1,17 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- import argparse +import datetime as dt +import gzip +import os import re import sqlite3 +import dns.resolver as dns import tabulate from defusedxml import ElementTree -import os -import gzip + +from config import Config class AbuseReport: @@ -16,9 +20,10 @@ class AbuseReport: def __init__(self, arguments): self.infile = arguments.infile self.domain = arguments.domain + self.report = arguments.report self.path = os.path.dirname(__file__) - self.conn = sqlite3.connect("".join([self.path, "/spam.db"])) + self.conn = sqlite3.connect("/".join([self.path, "spam.db"])) self.jid_pattern = 
re.compile("^(?:([^\"&'/:<>@]{1,1023})@)?([^/@]{1,1023})(?:/(.{1,1023}))?$") self.message_pattern = re.compile(r'<message.*?</message>', re.DOTALL) @@ -26,10 +31,9 @@ class AbuseReport: """ method deciding over which action to take """ - if self.infile is None: # infile unset -> report top10 - self.report() + self.egest() elif self.infile: # infile set -> ingest @@ -38,33 +42,47 @@ class AbuseReport: # close sqlite connection self.conn.close() - def report(self): + def egest(self): """ - report method - :return: top10 score or domain specific data + egest method + if specific domain is supplied return only those results + in any other case return top 10 table """ - # if a specific domain is supplied return only that set + result = list() + + # if domain is specified return info for that domain if self.domain is not None: - # first and last time seen spam from specified domain - first = self.conn.execute("SELECT ts FROM spam WHERE domain=:domain ORDER BY ts LIMIT 1", - {"domain": self.domain}).fetchone()[0] - last = self.conn.execute("SELECT ts FROM spam WHERE domain=:domain ORDER BY ts DESC LIMIT 1", - {"domain": self.domain}).fetchone()[0] + result = list() - print("First seen : {first}\nLast seen : {last}\n".format(first=first, last=last)) + # iterate over all domains supplied + for domain in self.domain: - result = self.conn.execute('SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain FROM spam ' - 'WHERE domain=\'{}\';'.format(self.domain)) - else: + query = self.conn.execute('''SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain, + MIN(ts) AS first,MAX(ts) AS last FROM spam WHERE domain = :domain;''', + {"domain": domain}).fetchall() - result = self.conn.execute('SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS domain ' - 'FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;') + # if specified domain is not listed yet, the resulting table would miss the domain name + # this ugle tuple 2 list swap prevents this behaviour 
+ temp = list(query[0]) + if temp[2] is None: + temp[2] = domain + query[0] = tuple(temp) - # format data as table - table = tabulate.tabulate(result, headers=["messages", "bots", "domain"], tablefmt="orgtbl") - print(table) + # extend result table + result.extend(query) + # generate report if enabled + if self.report: + self.gen_report(domain, query) + else: + # in any other case return top 10 + result = self.conn.execute('''SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS domain + FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;''') + # format data as table + table = tabulate.tabulate(result, headers=["messages", "bots", "domain","first seen", "last seen"], + tablefmt="orgtbl") + print(table) def ingest(self): """ @@ -73,7 +91,7 @@ class AbuseReport: """ magic_number = b"\x1f\x8b\x08" - # split up list + # iterate over all infile elements for element in self.infile: try: @@ -82,9 +100,9 @@ class AbuseReport: content = infile.read() except FileNotFoundError as err: + content = "" print(err) - # check file for gzip magic number # if magic number is present decompress and decode file if content.startswith(magic_number): content = gzip.decompress(content).decode("utf-8") @@ -92,12 +110,14 @@ class AbuseReport: else: content = content.decode("utf-8") + # automated run None catch if content is not None: self.parse(content) def parse(self, infile): """ method to parse xml messages + :type infile: str :param infile: string containing xml stanzas """ log = re.findall(self.message_pattern, infile) @@ -108,6 +128,7 @@ class AbuseReport: def db_import(self, message_log): """ import xml stanzas into database + :type infile: str :param message_log: xml messages """ self.conn.execute('''CREATE TABLE IF NOT EXISTS "spam" ("user" TEXT, "domain" TEXT, "ts" TEXT, "message" TEXT, @@ -143,31 +164,145 @@ class AbuseReport: finally: self.conn.commit() + def gen_report(self, domain, query): + """ + method generating the report files + :type domain: str + :param 
domain: string containing a domain name + :param query: sqlite cursor object containing the query results for the specified domain + """ + try: + # open abuse report template file + with open("/".join([self.path, "template/abuse-template.txt"]), "r", encoding="utf-8") as template: + report_template = template.read() + + except FileNotFoundError as err: + print(err) + exit(1) + + # current date + now = dt.datetime.strftime(dt.datetime.now(), "%Y-%m-%d") + + # output to report directory + report_filename = "abuse-{domain}-{date}.txt".format(date=now, domain=domain) + jids_filename = "abuse-{domain}-{date}-jids.txt".format(date=now, domain=domain) + logs_filename = "abuse-{domain}-{date}-logs.txt".format(date=now, domain=domain) + + # write report files + with open("/".join([self.path, "report", report_filename]), "w", encoding="utf-8") as report_out: + content = self.report_template(report_template, domain, query) + report_out.write(content) + + with open("/".join([self.path, "report", jids_filename]), "w", encoding="utf-8") as report_out: + content = self.report_jids(domain) + report_out.write(content) + + with open("/".join([self.path, "report", logs_filename]), "w", encoding="utf-8") as report_out: + content = self.report_logs(domain) + report_out.write(content) + + def report_template(self, template, domain, query): + """ + method to collect and format the template file to the final abuse report + :type template: str + :type domain: str + :param template: string containing the abuse report template + :param domain: string containing a domain name + :param query: sqlite cursor object containing the query results for the specified domain + :return: string containing the fully formatted abuse report + """ + name = Config.name + + # lookup srv and domain info + info = self.srvlookup(domain) + srv = info[0]["host"] + ips = "".join(info[0]["ip"]) + summary = tabulate.tabulate(query, headers=["messages", "bots", "domain","first seen", "last seen"], + tablefmt="orgtbl") 
+ + report_out= template.format(name=name, domain=domain, srv=srv, ips=ips, summary=summary) + + return report_out + + def report_jids(self, domain): + """ + method to collect all involved jids from the database + :type domain: str + :param domain: string containing a domain name + :return: formatted string containing the result + """ + + jids = self.conn.execute('''SELECT user || '@' || domain as jid FROM spam WHERE domain=:domain GROUP BY user + ORDER BY 1;''', {"domain": domain}).fetchall() + + return tabulate.tabulate(jids, tablefmt="plain") + + def report_logs(self, domain): + """ + method to collect all messages grouped by frequency + :type domain: str + :param domain: string containing a domain name + :return: formatted string containing the result + """ + logs = self.conn.execute('''SELECT char(10)||MIN(ts)||' - '||MAX(ts)||char(10)||COUNT(*)||' messages:'||char(10) + ||'========================================================================'||char(10)||message||char(10)|| + '========================================================================' FROM spam WHERE domain=:domain + GROUP BY message ORDER BY COUNT(*) DESC LIMIT 10;''', {"domain": domain}).fetchall() + + return tabulate.tabulate(logs, tablefmt="plain") + + def srvlookup(self, domain): + """ + srv lookup method for the domain provided, if no srv record is found the base domain is used + :type domain: str + :param domain: provided domain to query srv records for + :return: sorted list of dictionaries containing host and ip info + """ + # srv + query = '_xmpp-client._tcp.{}'.format(domain) + + try: + srv_records = dns.query(query, 'SRV') + + except (dns.NXDOMAIN, dns.NoAnswer): + # catch NXDOMAIN and NoAnswer tracebacks + srv_records = None + + # extract record + results = list() + + if srv_records is not None: + # extract all available records + for record in srv_records: + info = dict() + + # gather necessary info from srv records + info["host"] = str(record.target).rstrip('.') + 
info["weight"] = record.weight + info["priority"] = record.priority + info["ip"] = [ip.address for ip in dns.query(info["host"], "A")] + results.append(info) + + # return list sorted by priority and weight + return sorted(results, key=lambda i: (i['priority'], i["weight"])) + + # prevent empty info when srv records are not present + info = dict() + + # gather necessary info from srv records + info["host"] = domain + info["ip"] = [ip.address for ip in dns.query(info["host"], "A")] + results.append(info) + + return results + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-in', '--infile', nargs='+', help='set path to input file', dest='infile') - parser.add_argument('-d', '--domain', help='specify report domain', dest='domain') + parser.add_argument('-d', '--domain', action='append', help='specify report domain', dest='domain') + parser.add_argument('-r', '--report', action='store_true', help='toggle report output to file', dest='report') args = parser.parse_args() # run AbuseReport(args).main() - -""" -# Top 10 Domains and their score -SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS 'domain' -FROM spam -GROUP BY domain -ORDER BY 1 DESC LIMIT 10; - -# Most frequent messages -SELECT COUNT(*) as count, COUNT(distinct user||domain) as bots,message -FROM spam -GROUP BY message HAVING bots > 1 -ORDER BY 1 DESC LIMIT 5; - -# report sql -SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain -FROM spam -WHERE domain="default.rs"; -""" diff --git a/report/.gitkeep b/report/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/report/.gitkeep diff --git a/template/abuse-template.txt b/template/abuse-template.txt new file mode 100644 index 0000000..fe729bb --- /dev/null +++ b/template/abuse-template.txt @@ -0,0 +1,28 @@ +Subject: XMPP spam report for {domain} / {ips} + +XMPP domain: {domain} +Server: {srv} +Jabber IP: {ips} + +Hi, + +the above mentioned server is used as an open relay to send 
vast amounts +of XMPP spam to different unrelated servers, such as the server I +administer. + +Spammers are using the In-Band-Registration feature on that server to +create a large number of accounts, and to send mass messages to my +users. + +Please contact the server owner to disable In-Band-Registration, to take +measures against spam relaying or to shut down the XMPP service. + +Also please find attached a list of the bot accounts and an excerpt of +the spam messages sent to my service. + +{summary} + + +Kind regards, + +{name} |