From 8e7ac358a4d0f2149873304361c73870d06f8e18 Mon Sep 17 00:00:00 2001
From: nico <nico@magicbroccoli.de>
Date: Wed, 29 May 2019 23:04:22 +0200
Subject: initial release report feature

Misc
+ add report directory
+ add template directory

Feature Release
+ add report feature --report
	+ add basic report template
+ add feature to use -d/ --domain flag multiple times

Optimization
+ add config.py loading user config
* further code optimization
* update gitignore file
---
 .gitignore                  |   2 +-
 config.py                   |  19 +++++
 main.py                     | 196 ++++++++++++++++++++++++++++++++++----------
 report/.gitkeep             |   0
 template/abuse-template.txt |  28 +++++++
 5 files changed, 201 insertions(+), 44 deletions(-)
 create mode 100644 config.py
 create mode 100644 report/.gitkeep
 create mode 100644 template/abuse-template.txt
diff --git a/.gitignore b/.gitignore
index cce7ba1..bc8cbd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,4 +129,4 @@ pip-selfcheck.json
 # project specific files
 spam.db
 config.json
-spam-*.txt*
+/report/*.txt
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..12a41a4
--- /dev/null
+++ b/config.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+import json
+
+# try to read config.json if nonexistent create config.json an populate it
+try:
+	with open("config.json", "r", encoding="utf-8") as f:
+		config = json.load(f)
+
+except FileNotFoundError:
+	with open("config.json", "w", encoding="utf-8") as f:
+		config = {
+			"name": "",
+		}
+		f.write(json.dumps(config))
+
+
+class Config(object):
+	"""extract secret key to use for the webserver"""
+	name = config["name"]
diff --git a/main.py b/main.py
index 250cebb..d6ed2d3 100755
--- a/main.py
+++ b/main.py
@@ -1,13 +1,17 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import argparse
+import datetime as dt
+import gzip
+import os
 import re
 import sqlite3
 
+import dns.resolver as dns
 import tabulate
 from defusedxml import ElementTree
-import os
-import gzip
+
+from config import Config
 
 
 class AbuseReport:
@@ -16,9 +20,10 @@ class AbuseReport:
 	def __init__(self, arguments):
 		self.infile = arguments.infile
 		self.domain = arguments.domain
+		self.report = arguments.report
 		self.path = os.path.dirname(__file__)
 
-		self.conn = sqlite3.connect("".join([self.path, "/spam.db"]))
+		self.conn = sqlite3.connect("/".join([self.path, "spam.db"]))
 		self.jid_pattern = re.compile("^(?:([^\"&'/:<>@]{1,1023})@)?([^/@]{1,1023})(?:/(.{1,1023}))?$")
 		self.message_pattern = re.compile(r'<message.*?</message>', re.DOTALL)
 
@@ -26,10 +31,9 @@ class AbuseReport:
 		"""
 		method deciding over which action to take
 		"""
-
 		if self.infile is None:
 			# infile unset -> report top10
-			self.report()
+			self.egest()
 
 		elif self.infile:
 			# infile set -> ingest
@@ -38,33 +42,45 @@ class AbuseReport:
 		# close sqlite connection
 		self.conn.close()
 
-	def report(self):
+	def egest(self):
 		"""
 		report method
 		:return: top10 score or domain specific data
 		"""
-		# if a specific domain is supplied return only that set
+		result = list()
+
+		# if domain is specified return info for that domain
 		if self.domain is not None:
-			# first and last time seen spam from specified domain
-			first = self.conn.execute("SELECT ts FROM spam WHERE domain=:domain ORDER BY ts LIMIT 1",
-									  {"domain": self.domain}).fetchone()[0]
-			last = self.conn.execute("SELECT ts FROM spam WHERE domain=:domain ORDER BY ts DESC LIMIT 1",
-									 {"domain": self.domain}).fetchone()[0]
+			result = list()
 
-			print("First seen : {first}\nLast seen : {last}\n".format(first=first, last=last))
+			# iterate over all domains supplied
+			for domain in self.domain:
 
-			result = self.conn.execute('SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain FROM spam '
-									   'WHERE domain=\'{}\';'.format(self.domain))
-		else:
+				query = self.conn.execute('''SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain,
+					MIN(ts) AS first,MAX(ts) AS last FROM spam WHERE domain = :domain;''',
+					{"domain": domain}).fetchall()
 
-			result = self.conn.execute('SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS domain '
-									   'FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;')
+				# ugly tuple list swapping for nicer formatting
+				temp = list(query[0])
+				if temp[2] is None:
+					temp[2] = domain
+					query[0] = tuple(temp)
 
-		# format data as table
-		table = tabulate.tabulate(result, headers=["messages", "bots", "domain"], tablefmt="orgtbl")
-		print(table)
+				# extend result table
+				result.extend(query)
 
+				# generate report if enabled
+				if self.report:
+					self.gen_report(domain, query)
+		else:
+			# in any other case return top 10
+			result = self.conn.execute('''SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS domain 
+				FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;''')
 
+		# format data as table
+		table = tabulate.tabulate(result, headers=["messages", "bots", "domain","first seen", "last seen"],
+								  tablefmt="orgtbl")
+		print(table)
 
 	def ingest(self):
 		"""
@@ -73,7 +89,7 @@ class AbuseReport:
 		"""
 		magic_number = b"\x1f\x8b\x08"
 
-		# split up list
+		# iterate over all infile elements
 		for element in self.infile:
 
 			try:
@@ -82,9 +98,9 @@ class AbuseReport:
 					content = infile.read()
 
 			except FileNotFoundError as err:
+				content = ""
 				print(err)
 
-			# check file for gzip magic number
 			# if magic number is present decompress and decode file
 			if content.startswith(magic_number):
 				content = gzip.decompress(content).decode("utf-8")
@@ -92,6 +108,7 @@ class AbuseReport:
 			else:
 				content = content.decode("utf-8")
 
+			# automated run None catch
 			if content is not None:
 				self.parse(content)
 
@@ -143,31 +160,124 @@ class AbuseReport:
 			finally:
 				self.conn.commit()
 
+	def gen_report(self, domain, query):
+		try:
+			# open abuse report template file
+			with open("/".join([self.path, "template/abuse-template.txt"]), "r", encoding="utf-8") as template:
+				report_template = template.read()
+
+		except FileNotFoundError as err:
+			print(err)
+			exit(1)
+
+		# current date
+		now = dt.datetime.strftime(dt.datetime.now(), "%Y-%m-%d")
+
+		# output to report directory
+		report_filename = "abuse-{domain}-{date}.txt".format(date=now, domain=domain)
+		jids_filename = "abuse-{domain}-{date}-jids.txt".format(date=now, domain=domain)
+		logs_filename = "abuse-{domain}-{date}-logs.txt".format(date=now, domain=domain)
+
+		# write report files
+		with open("/".join([self.path, "report", report_filename]), "w", encoding="utf-8") as report_out:
+			content = self.report_template(report_template, domain, query)
+			report_out.write(content)
+
+		with open("/".join([self.path, "report", jids_filename]), "w", encoding="utf-8") as report_out:
+			content = self.report_jids(domain)
+			report_out.write(content)
+
+		with open("/".join([self.path, "report", logs_filename]), "w", encoding="utf-8") as report_out:
+			content = self.report_logs(domain)
+			report_out.write(content)
+
+	def report_template(self, template, domain, query):
+		name = Config.name
+
+		# lookup srv and domain info
+		info = self.srvlookup(domain)
+		srv = info[0]["host"]
+		ips = "".join(info[0]["ip"])
+		summary = tabulate.tabulate(query, headers=["messages", "bots", "domain","first seen", "last seen"],
+									tablefmt="orgtbl")
+
+		report_out= template.format(name=name, domain=domain, srv=srv, ips=ips, summary=summary)
+
+		return report_out
+
+	def report_jids(self, domain):
+
+		jids = self.conn.execute('''SELECT user || '@' || domain as jid FROM spam WHERE domain=:domain GROUP BY user
+			ORDER BY 1;''', {"domain": domain}).fetchall()
+
+		return tabulate.tabulate(jids, tablefmt="plain")
+
+	def report_logs(self, domain):
+		"""
+
+		:param domain:
+		:return:
+		"""
+		logs = self.conn.execute('''SELECT char(10)||MIN(ts)||' - '||MAX(ts)||char(10)||COUNT(*)||' messages:'||char(10)
+			||'========================================================================'||char(10)||message||char(10)||
+			'========================================================================' FROM spam WHERE domain=:domain
+			GROUP BY message ORDER BY COUNT(*) DESC LIMIT 10;''', {"domain": domain}).fetchall()
+
+		return tabulate.tabulate(logs, tablefmt="plain")
+
+	def srvlookup(self, domain):
+		"""
+		srv lookup method for the domain provided, if no srv record is found the base domain is used
+		:type domain: str
+		:param domain: provided domain to query srv records for
+		:return: sorted list of dictionaries containing host and ip info
+		"""
+		# srv
+		query = '_xmpp-client._tcp.{}'.format(domain)
+
+		try:
+			srv_records = dns.query(query, 'SRV')
+
+		except (dns.NXDOMAIN, dns.NoAnswer):
+			# catch NXDOMAIN and NoAnswer tracebacks
+			srv_records = None
+
+		# extract record
+		results = list()
+
+		if srv_records is not None:
+			# extract all available records
+			for record in srv_records:
+				info = dict()
+
+				# gather necessary info from srv records
+				info["host"] = str(record.target).rstrip('.')
+				info["weight"] = record.weight
+				info["priority"] = record.priority
+				info["ip"] = [ip.address for ip in dns.query(info["host"], "A")]
+				results.append(info)
+
+			# return list sorted by priority and weight
+			return sorted(results, key=lambda i: (i['priority'], i["weight"]))
+
+		else:
+			# prevent empty info when srv records are not present
+			info = dict()
+
+			# gather necessary info from srv records
+			info["host"] = domain
+			info["ip"] = [ip.address for ip in dns.query(info["host"], "A")]
+			results.append(info)
+
+			return results
+
 
 if __name__ == "__main__":
 	parser = argparse.ArgumentParser()
 	parser.add_argument('-in', '--infile', nargs='+', help='set path to input file', dest='infile')
-	parser.add_argument('-d', '--domain', help='specify report domain', dest='domain')
+	parser.add_argument('-d', '--domain', action='append', help='specify report domain', dest='domain')
+	parser.add_argument('-r', '--report', action='store_true',  help='toggle report output to file', dest='report')
 	args = parser.parse_args()
 
 	# run
 	AbuseReport(args).main()
-
-"""
-# Top 10 Domains and their score
-SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS 'domain'
-FROM spam 
-GROUP BY domain
-ORDER BY 1 DESC LIMIT 10;
-
-# Most frequent messages
-SELECT COUNT(*) as count, COUNT(distinct user||domain) as bots,message
-FROM spam
-GROUP BY message HAVING bots > 1 
-ORDER BY 1 DESC LIMIT 5;
-
-# report sql
-SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain
-FROM spam 
-WHERE domain="default.rs";
-"""
diff --git a/report/.gitkeep b/report/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/template/abuse-template.txt b/template/abuse-template.txt
new file mode 100644
index 0000000..fe729bb
--- /dev/null
+++ b/template/abuse-template.txt
@@ -0,0 +1,28 @@
+Subject: XMPP spam report for {domain} / {ips}
+
+XMPP domain: {domain}
+Server:      {srv}
+Jabber IP:   {ips}
+
+Hi,
+
+the above mentioned server is used as an open relay to send vast amounts
+of XMPP spam to different unrelated servers, such as the server I
+administer.
+
+Spammers are using the In-Band-Registration feature on that server to
+create a large number of accounts, and to send mass messages to my
+users.
+
+Please contact the server owner to disable In-Band-Registration, to take
+measures against spam relaying or to shut down the XMPP service.
+
+Also please find attached a list of the bot accounts and an excerpt of
+the spam messages sent to my service.
+
+{summary}
+
+
+Kind regards,
+
+{name}
-- 
cgit v1.2.3-54-g00ecf


From 6b2bf0719324856a0bbacb9b47dea88a7b9199d1 Mon Sep 17 00:00:00 2001
From: nico <nico@magicbroccoli.de>
Date: Thu, 30 May 2019 02:18:45 +0200
Subject: readme update

* update README.md to the latest changes
---
 README.md | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 10d92d9..b11279a 100644
--- a/README.md
+++ b/README.md
@@ -14,25 +14,48 @@ pip install -r requirements.txt
 modules:
   mod_spam_filter:
     ...
-    spam_dump_file: "/var/log/ejabberd/spam-example.de.txt"
+    spam_dump_file: "/var/log/ejabberd/spam-@HOST@.txt"
     ...
 ```
 
 ## usage main.py
 ```
-usage: main.py [-h] [-in INFILE] [-d DOMAIN]
+usage: main.py [-h] [-in INFILE [INFILE ...]] [-d DOMAIN] [-r]
 
 optional arguments:
   -h, --help            show this help message and exit
-  -in INFILE, --infile INFILE
+  -in INFILE [INFILE ...], --infile INFILE [INFILE ...]
                         set path to input file
   -d DOMAIN, --domain DOMAIN
                         specify report domain
+  -r, --report          toggle report output to file
 ```
 
-The `--in` argument does only support a single log file at a time.
+#### -in / --infile
+The `--in` or `--infile` argument is designed to run automatically via the logrotate daemon. Therefor the script is 
+able to process gzip compressed files and also multiple files at once via shell expansion.
 
-## usage abusereport-domain.sh
+##### example
+If ejabberd is configured to create multiple spamdump files it is possible to ingest all files at once, following 
+this example.
 ```bash
-./abusereport-domain.sh domain.tld
-```
\ No newline at end of file
+$ ./main.py --in /var/log/ejabberd/spam-*.log
+```
+
+#### -d / --domain
+If a domain is specifically defined to be processed, the script will only query the sqlite database for that domain. 
+It is possible to provide multiple domains at once via multiple `-d` or `--domain` arguments.
+
+##### example
+```bash
+$ ./main.py --d example.tld -d example.com
+
+|   messages |   bots | domain      | first seen                  | last seen                   |
+|------------+--------+-------------+-----------------------------+-----------------------------|
+|         15 |      9 | example.tld | 2019-04-28T20:19:43.939926Z | 2019-05-22T13:59:53.339834Z |
+|         23 |      7 | example.com | 2018-02-28T20:19:43.939926Z | 2019-05-22T13:59:53.339834Z |
+```
+
+#### -r / --report
+This flag will only take effect if the `-d` or `--domain` argument is used. If that is the case, the script will 
+automatically gather information about the specified domain and write it to the `report` directory.
-- 
cgit v1.2.3-54-g00ecf


From 0d9f4147b08be59e2f6dc98efce24942985a590b Mon Sep 17 00:00:00 2001
From: nico <nico@magicbroccoli.de>
Date: Thu, 30 May 2019 02:35:23 +0200
Subject: unnecessary else

* remove unnecessary else clause
---
 main.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/main.py b/main.py
index d6ed2d3..efb3701 100755
--- a/main.py
+++ b/main.py
@@ -260,16 +260,15 @@ class AbuseReport:
 			# return list sorted by priority and weight
 			return sorted(results, key=lambda i: (i['priority'], i["weight"]))
 
-		else:
-			# prevent empty info when srv records are not present
-			info = dict()
+		# prevent empty info when srv records are not present
+		info = dict()
 
-			# gather necessary info from srv records
-			info["host"] = domain
-			info["ip"] = [ip.address for ip in dns.query(info["host"], "A")]
-			results.append(info)
+		# gather necessary info from srv records
+		info["host"] = domain
+		info["ip"] = [ip.address for ip in dns.query(info["host"], "A")]
+		results.append(info)
 
-			return results
+		return results
 
 
 if __name__ == "__main__":
-- 
cgit v1.2.3-54-g00ecf


From 911450c72e83985a8ff8a7f1301184515f4aa351 Mon Sep 17 00:00:00 2001
From: nico <nico@magicbroccoli.de>
Date: Thu, 30 May 2019 02:49:38 +0200
Subject: readme config.py addition

* update README to explain config.py
---
 README.md | 13 +++++++++++++
 config.py |  1 -
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b11279a..97c056f 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,19 @@ modules:
     ...
 ```
 
+### config.json
+The `config.json` file is used to preserve data from possible updates to this script. `config.py` will load
+`config.json` to extract the name, which is used to sign the report message. In the future `config.json` may
+contain other settings as well.
+
+```json
+$ cat config.json
+{
+  "name": "username"
+}
+```
+
+
 ## usage main.py
 ```
 usage: main.py [-h] [-in INFILE [INFILE ...]] [-d DOMAIN] [-r]
diff --git a/config.py b/config.py
index 12a41a4..ff6f9fa 100644
--- a/config.py
+++ b/config.py
@@ -15,5 +15,4 @@ except FileNotFoundError:
 
 
 class Config(object):
-	"""extract secret key to use for the webserver"""
 	name = config["name"]
-- 
cgit v1.2.3-54-g00ecf


From f0940cc6152faec3695301c1f70430fddc38a898 Mon Sep 17 00:00:00 2001
From: nico <nico@magicbroccoli.de>
Date: Fri, 31 May 2019 15:31:21 +0200
Subject: further documentation

* add more docstrings to main.py
* add README entry for no argument

Thanks to @weiss
---
 README.md | 24 +++++++++++++++++++++++-
 main.py   | 38 ++++++++++++++++++++++++++++++++------
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 97c056f..5fca75e 100644
--- a/README.md
+++ b/README.md
@@ -44,8 +44,30 @@ optional arguments:
   -r, --report          toggle report output to file
 ```
 
+#### run with no argument
+If `main.py` is run without any arguments attached, then the script will output a "top 10" table showing the number
+of messages/bots for the most spammy domains in the database.
+
+##### example
+```bash
+$ ./main.py
+
+|   messages |   bots | domain        |
+|------------+--------+---------------|
+|         42 |      1 | example.net   |
+|         17 |      9 | example.rs    |
+|          7 |      5 | example.cd    |
+|          5 |      3 | example.de    |
+|          4 |      4 | example.ru    |
+|          3 |      1 | example.co.uk |
+|          3 |      3 | example.com   |
+|          3 |      1 | example.net   |
+|          3 |      1 | example.fr    |
+|          3 |      1 | example.com   |
+```
+
 #### -in / --infile
-The `--in` or `--infile` argument is designed to run automatically via the logrotate daemon. Therefor the script is 
+The `--in` or `--infile` argument is designed to run automatically via the logrotate daemon. Therefore the script is 
 able to process gzip compressed files and also multiple files at once via shell expansion.
 
 ##### example
diff --git a/main.py b/main.py
index efb3701..7090eb5 100755
--- a/main.py
+++ b/main.py
@@ -44,8 +44,9 @@ class AbuseReport:
 
 	def egest(self):
 		"""
-		report method
-		:return: top10 score or domain specific data
+		egest method
+		if specific domain is supplied return only those results
+		in any other case return top 10 table
 		"""
 		result = list()
 
@@ -60,7 +61,8 @@ class AbuseReport:
 					MIN(ts) AS first,MAX(ts) AS last FROM spam WHERE domain = :domain;''',
 					{"domain": domain}).fetchall()
 
-				# ugly tuple list swapping for nicer formatting
+				# if specified domain is not listed yet, the resulting table would miss the domain name
+				# this ugly tuple-to-list swap prevents this behaviour
 				temp = list(query[0])
 				if temp[2] is None:
 					temp[2] = domain
@@ -115,6 +117,7 @@ class AbuseReport:
 	def parse(self, infile):
 		"""
 		method to parse xml messages
+		:type infile: str
 		:param infile: string containing xml stanzas
 		"""
 		log = re.findall(self.message_pattern, infile)
@@ -125,6 +128,7 @@ class AbuseReport:
 	def db_import(self, message_log):
 		"""
 		import xml stanzas into database
+		:type message_log: str
 		:param message_log: xml messages
 		"""
 		self.conn.execute('''CREATE TABLE IF NOT EXISTS "spam" ("user" TEXT, "domain" TEXT, "ts" TEXT, "message" TEXT, 
@@ -161,6 +165,12 @@ class AbuseReport:
 				self.conn.commit()
 
 	def gen_report(self, domain, query):
+		"""
+		method generating the report files
+		:type domain: str
+		:param domain: string containing a domain name
+		:param query: sqlite cursor object containing the query results for the specified domain
+		"""
 		try:
 			# open abuse report template file
 			with open("/".join([self.path, "template/abuse-template.txt"]), "r", encoding="utf-8") as template:
@@ -192,6 +202,15 @@ class AbuseReport:
 			report_out.write(content)
 
 	def report_template(self, template, domain, query):
+		"""
+		method to collect and format the template file to the final abuse report
+		:type template: str
+		:type domain: str
+		:param template: string containing the abuse report template
+		:param domain: string containing a domain name
+		:param query: sqlite cursor object containing the query results for the specified domain
+		:return: string containing the fully formatted abuse report
+		"""
 		name = Config.name
 
 		# lookup srv and domain info
@@ -206,6 +225,12 @@ class AbuseReport:
 		return report_out
 
 	def report_jids(self, domain):
+		"""
+		method to collect all involved jids from the database
+		:type domain: str
+		:param domain: string containing a domain name
+		:return: formatted string containing the result
+		"""
 
 		jids = self.conn.execute('''SELECT user || '@' || domain as jid FROM spam WHERE domain=:domain GROUP BY user
 			ORDER BY 1;''', {"domain": domain}).fetchall()
@@ -214,9 +239,10 @@ class AbuseReport:
 
 	def report_logs(self, domain):
 		"""
-
-		:param domain:
-		:return:
+		method to collect all messages grouped by frequency
+		:type domain: str
+		:param domain: string containing a domain name
+		:return: formatted string containing the result
 		"""
 		logs = self.conn.execute('''SELECT char(10)||MIN(ts)||' - '||MAX(ts)||char(10)||COUNT(*)||' messages:'||char(10)
 			||'========================================================================'||char(10)||message||char(10)||
-- 
cgit v1.2.3-54-g00ecf


From 0d7e2f0c7cef6b7a853107bf37d44816244e7749 Mon Sep 17 00:00:00 2001
From: nico <nico@magicbroccoli.de>
Date: Sun, 2 Jun 2019 15:30:54 +0200
Subject: file system clutter

+ add correct path to config.json to prevent file system clutter
---
 config.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/config.py b/config.py
index ff6f9fa..fac5f9b 100644
--- a/config.py
+++ b/config.py
@@ -1,13 +1,18 @@
 # -*- coding: utf-8 -*-
 import json
+import os
+
+# filepath of the config.json in the project directory
+path = os.path.dirname(__file__)
+filepath = ("/".join([path, "config.json"]))
 
 # try to read config.json if nonexistent create config.json an populate it
 try:
-	with open("config.json", "r", encoding="utf-8") as f:
+	with open(filepath, "r", encoding="utf-8") as f:
 		config = json.load(f)
 
 except FileNotFoundError:
-	with open("config.json", "w", encoding="utf-8") as f:
+	with open(filepath, "w", encoding="utf-8") as f:
 		config = {
 			"name": "",
 		}
-- 
cgit v1.2.3-54-g00ecf