[08.10.20] Final Spam filter classification with 2.5k new records to train on

andrewso 2020-10-08 10:56:03 +01:00
parent 12454d17d2
commit 9c4b3c3de9
6 changed files with 2616 additions and 208 deletions


@@ -2,7 +2,7 @@ FROM python:3.7
MAINTAINER Andrew Sotheran <cryptosky.user@gmail.com>
RUN apt update -y && \
    apt install -y python3-pip && \
    pip3 install nltk numpy sklearn flask pandas && \
    pip3 install nltk numpy sklearn flask pandas python-json-logger && \
    rm -rf /var/lib/apt/lists/*
COPY . /home/spam-filter/.
RUN python3 /home/spam-filter/configuration/scripts/nltk_package_downloads.py
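
The image build runs configuration/scripts/nltk_package_downloads.py to bake the NLTK data into the image. A minimal sketch of what such a script could look like, assuming the filter only needs the standard tokenizer and stopword corpora (the actual package list is not visible in this diff):

#!/usr/bin/env python3
# Hypothetical sketch of configuration/scripts/nltk_package_downloads.py:
# pre-fetch NLTK corpora at build time so the container never downloads
# them at request time. The package list below is an assumption.
import nltk

for package in ['punkt', 'stopwords']:
    nltk.download(package)

Downloading at build time keeps container start-up deterministic, at the cost of a slightly larger image.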


@@ -5,8 +5,8 @@ sys.path.append('/home/spam-filter/')
from threading import Thread
from tweets.tweetFilter import tweetFilter
# from tweets.tweetFilter import tweetFilter
# from news.newsFilter import newsFilter
from src.utils.jsonLogger import setup_logging, log
from flask import Flask, request
@@ -20,7 +20,11 @@ filter = tweetFilter()
def tweetPredict():
    tweet = request.args.get('tweet')
    log("Receiving Tweet to classify {}".format(tweet), 'INFO')
    result = filter.tweetFilterPredit(tweet)
    log("Returning classification result", 'INFO')
    return json.dumps({'result': result, 'tweet': tweet}), 200, {'ContentType': 'application/json'}
def callTweetFilter():
@@ -35,7 +39,9 @@ def callProbes():
    runFlaskProbes()
if __name__ == '__main__':
    print("Console: ", "==== Spam Filter - Tweets & News ====")
    setup_logging()
    log("Starting Spam Filter...", 'INFO')
    sys.stdout.flush()
    Thread(target=callProbes).start()
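
For reference, a quick way to exercise the classification endpoint once the container is running. The route path and port are assumptions here, since the @app.route decorator falls outside the hunks shown:

import requests

# Hypothetical route and port; check the @app.route decorator for the real values.
resp = requests.get('http://localhost:5000/tweet',
                    params={'tweet': 'Win a FREE iPhone, click here!'})
print(resp.json())  # e.g. {'result': 'spam', 'tweet': 'Win a FREE iPhone, click here!'}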

File diff suppressed because it is too large


@@ -13,6 +13,8 @@ from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from src.utils.jsonLogger import log
# Global Metrics
HB_NB_Precision = 0
HB_NB_Recall = 0
@@ -214,6 +216,7 @@ class multinomialNaiveBayes(object):
        self.predictTest()

    def dataset(self):
        log("Creating Training and Test datasets", 'INFO')
        self.data = pd.read_csv(self.training_set)
        self.data.drop_duplicates(inplace=True)
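
The rest of dataset() is outside the hunk, but with the imports shown above (CountVectorizer, train_test_split) the vectorize-and-split step usually reduces to something like the sketch below; the column names and the 80/20 split ratio are assumptions:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

data = pd.read_csv('src/resources/tweet_spam_ham.csv')
data.drop_duplicates(inplace=True)

cv = CountVectorizer()                              # fit on the full corpus
X = cv.fit_transform(data['text']).toarray()        # 'text' column name is an assumption
X_train, X_test, y_train, y_test = train_test_split(
    X, data['label'], test_size=0.2)                # 'label' and 80/20 are assumptions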
@@ -228,11 +231,15 @@ class multinomialNaiveBayes(object):
    def train(self):
        self.classifier = MultinomialNB()
        log("Fitting Split Train datasets against Bayes Classifier", 'INFO')
        self.classifier.fit(self.X_train, self.y_train)
    def predictTest(self):
        log("Testing Prediction against X Test Dataset", 'INFO')
        self.pred = self.classifier.predict(self.X_test)
        print('Accuracy: ', accuracy_score(self.y_test, self.pred))
        log('Accuracy Prediction against Y Test Dataset :: {}'.format(accuracy_score(self.y_test, self.pred)), 'INFO')
    def predict(self, tweet):
        message = self.cv.transform([tweet]).toarray()
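
Continuing the names from the dataset sketch above, the train/test/predict path of this class reduces to:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

classifier = MultinomialNB()
classifier.fit(X_train, y_train)
print('Accuracy:', accuracy_score(y_test, classifier.predict(X_test)))

# Single-tweet prediction must reuse the SAME fitted CountVectorizer,
# otherwise the feature columns will not line up with training.
message = cv.transform(['Win a FREE iPhone, click here!']).toarray()
print(classifier.predict(message))  # e.g. array([1]) -> spam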
@@ -244,8 +251,9 @@ class tweetFilter(object):
        pass

    def tweetFilterTrain(self):
        self.Filter = multinomialNaiveBayes("/home/spam-filter/src/resources/tweet_spam_ham.csv")
        self.Filter = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv")
        log("Training Filter", 'INFO')
        self.Filter.trainFilter()
    ### Self-coded NB gets around 75-85% accuracy (not as good as sklearn's)
@@ -258,4 +266,6 @@ class tweetFilter(object):
    def tweetFilterPredit(self, tweet):
        df = pd.DataFrame(self.Filter.predict(tweet))
        df[0] = df[0].map({0: 'ham', 1: 'spam'})
        log("Classification of tweet as {}".format(df[0][0]), 'INFO')
        return df[0][0]
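
The DataFrame round-trip in tweetFilterPredit is effectively a label lookup; an equivalent minimal form, assuming predict() returns a one-element 0/1 array as above:

LABELS = {0: 'ham', 1: 'spam'}

def tweet_filter_predict(spam_filter, tweet):
    # spam_filter.predict(tweet) returns an array like [0] or [1]
    return LABELS[int(spam_filter.predict(tweet)[0])]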

src/utils/jsonLogger.py (new file, 40 lines)

@@ -0,0 +1,40 @@
#!/usr/bin/env python
import logging
from pythonjsonlogger import jsonlogger
import datetime
class CustomJsonFormatter(jsonlogger.JsonFormatter):
    def add_fields(self, log_record, record, message_dict):
        super(CustomJsonFormatter, self).add_fields(log_record, record, message_dict)
        if not log_record.get('@timestamp'):
            # this doesn't use record.created, so it is slightly off
            now = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            log_record['@timestamp'] = now
        if log_record.get('level'):
            log_record['level'] = log_record['level'].upper()
        else:
            log_record['level'] = record.levelname
def setup_logging(log_level='INFO'):
    logger = logging.getLogger(__name__)
    logger.propagate = 0
    logger.setLevel(log_level)
    logHandler = logging.StreamHandler()
    formatter = CustomJsonFormatter('%(@timestamp)s %(level)s %(name)s %(message)s')
    logHandler.setFormatter(formatter)
    logger.addHandler(logHandler)
def log(message, level):
    logger = logging.getLogger(__name__)
    if level == 'INFO':
        logger.info(message)
    elif level == 'WARN':
        logger.warning(message)  # logger.warn is a deprecated alias
    elif level == 'ERR':
        logger.error(message)
    elif level == 'DEBUG':
        logger.debug(message)