aboutsummaryrefslogtreecommitdiffstats
path: root/main.py
blob: e86faf4619890ac2d7befe475d4aea784783d1d5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import re
import sqlite3
import subprocess

from defusedxml import ElementTree


class AbuseReport:
	"""Ingestation script for ejabberd spam logs"""

	def __init__(self, arguments):
		self.infile = arguments.infile
		self.domain = arguments.domain
		self.conn = sqlite3.connect('spam.db')

		self.jid_pattern = re.compile("^(?:([^\"&'/:<>@]{1,1023})@)?([^/@]{1,1023})(?:/(.{1,1023}))?$")
		self.message_pattern = re.compile(r'<message.*?</message>', re.DOTALL)

	def main(self):
		"""
		method deciding over which action to take
		"""

		if self.infile is None:
			# infile unset -> report top10
			self.report()

		elif self.infile:
			# infile set -> ingest
			self.ingest()

		# close sqlite connection
		self.conn.close()

	def report(self):
		"""
		report method
		:return: top10 score or domain specific data
		"""
		# if a specific domain is supplied return only that set
		if self.domain is not None:
			sql = 'sqlite3 -column -header spam.db "SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain ' \
				'FROM spam WHERE domain=\'{}\';"'.format(self.domain)
		else:
			sql = 'sqlite3 -column -header spam.db "SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain AS domain ' \
				'FROM spam GROUP BY domain ORDER BY 1 DESC LIMIT 10;"'

		print(subprocess.getoutput(sql))

	def ingest(self):
		"""
		method to ingest xml messages into sqlite database
		"""
		try:
			with open(self.infile, "r", encoding="utf-8") as spam:
				log = re.findall(self.message_pattern, spam.read())

			self.db_import(log)
		except FileNotFoundError as err:
			print(err)
			exit(1)

	def db_import(self, message_log):
		"""
		import xml stanzas into database
		:param message_log: xml messages
		"""
		self.conn.execute('''CREATE TABLE IF NOT EXISTS "spam" ("user" TEXT, "domain" TEXT, "ts" TEXT, "message" TEXT, 
			PRIMARY KEY("domain","ts"));''')

		for message in message_log:
			message_parsed = ElementTree.fromstring(message)

			# parse from tag
			spam_from = message_parsed.get('from')
			match = self.jid_pattern.match(spam_from)
			(node, domain, resource) = match.groups()

			# stamp
			spam_time = message_parsed.find('.//{urn:xmpp:delay}delay').get('stamp')

			# body
			spam_body = message_parsed.find('{jabber:client}body').text

			# format sql
			sql = 'INSERT INTO spam("user", "domain", "ts", "message") VALUES("{}", "{}", "{}", "{}");'.format(
				node, domain, spam_time, spam_body
			)
			try:
				self.conn.execute(sql)
			except sqlite3.IntegrityError:
				pass
			finally:
				self.conn.commit()


if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument('-in', '--infile', help='set path to input file', dest='infile')
	parser.add_argument('-d', '--domain', help='specify report domain', dest='domain')
	args = parser.parse_args()

	# run
	AbuseReport(args).main()

"""
# Top 10 Domains and their score
SELECT COUNT(*) AS messages,COUNT(DISTINCT user)  AS bots,domain AS 'domain'
FROM spam 
GROUP BY domain
ORDER BY 1 DESC LIMIT 10;

# Most frequent messages
SELECT COUNT(*) as count, COUNT(distinct user||domain) as bots,message
FROM spam
GROUP BY message HAVING bots > 1 
ORDER BY 1 DESC LIMIT 5;

# report sql
SELECT COUNT(*) AS messages,COUNT(DISTINCT user) AS bots,domain
FROM spam 
WHERE domain="default.rs";
"""