[08.10.20] Final Spam filter classification with 2.5k new records to train on
This commit is contained in:
parent
12454d17d2
commit
9c4b3c3de9
@ -2,7 +2,7 @@ FROM python:3.7
|
||||
MAINTAINER Andrew Sotheran <cryptosky.user@gmail.com>
|
||||
RUN apt update -y && \
|
||||
apt install -y python3-pip && \
|
||||
pip3 install nltk numpy sklearn flask pandas && \
|
||||
pip3 install nltk numpy sklearn flask pandas python-json-logger && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
COPY . /home/spam-filter/.
|
||||
RUN python3 /home/spam-filter/configuration/scripts/nltk_package_downloads.py
|
||||
|
||||
12
src/main.py
12
src/main.py
@ -5,8 +5,8 @@ sys.path.append('/home/spam-filter/')
|
||||
|
||||
from threading import Thread
|
||||
from tweets.tweetFilter import tweetFilter
|
||||
# from tweets.tweetFilter import tweetFilter
|
||||
# from news.newsFilter import newsFilter
|
||||
|
||||
from src.utils.jsonLogger import setup_logging, log
|
||||
|
||||
from flask import Flask, request
|
||||
|
||||
@ -20,7 +20,11 @@ filter = tweetFilter()
|
||||
def tweetPredict():
|
||||
tweet = request.args.get('tweet')
|
||||
|
||||
log("Receiving Tweet to classify {}".format(tweet), 'INFO')
|
||||
|
||||
result = filter.tweetFilterPredit(tweet)
|
||||
|
||||
log("Returning classification result", 'INFO')
|
||||
return json.dumps({'result': result, 'tweet': tweet}), 200, {'ContentType':'application/json'}
|
||||
|
||||
def callTweetFilter():
|
||||
@ -35,7 +39,9 @@ def callProbes():
|
||||
runFlaskProbes()
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("Console: ", "==== Spam Filter - Tweets & News ====")
|
||||
setup_logging()
|
||||
|
||||
log("Starting Spam Filter...", 'INFO')
|
||||
sys.stdout.flush()
|
||||
|
||||
Thread(target=callProbes).start()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -13,6 +13,8 @@ from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import classification_report, accuracy_score
|
||||
|
||||
from src.utils.jsonLogger import log
|
||||
|
||||
# Global Metrics
|
||||
HB_NB_Precision = 0
|
||||
HB_NB_Recall = 0
|
||||
@ -214,6 +216,7 @@ class multinomialNaiveBayes(object):
|
||||
self.predictTest()
|
||||
|
||||
def dataset(self):
|
||||
log("Creating Training and Test datasets", 'INFO')
|
||||
self.data = pd.read_csv(self.training_set)
|
||||
|
||||
self.data.drop_duplicates(inplace = True)
|
||||
@ -228,11 +231,15 @@ class multinomialNaiveBayes(object):
|
||||
|
||||
def train(self):
|
||||
self.classifier = MultinomialNB()
|
||||
|
||||
log("Fitting Split Train datasets against Bayes Classifier", 'INFO')
|
||||
self.classifier.fit(self.X_train, self.y_train)
|
||||
|
||||
def predictTest(self):
|
||||
log("Testing Prediction agaist X Test Dataset", 'INFO')
|
||||
self.pred = self.classifier.predict(self.X_test)
|
||||
print('Accuracy: ', accuracy_score(self.y_test, self.pred))
|
||||
|
||||
log('Accuracy Prediction against Y Test Dataset :: {}'.format(accuracy_score(self.y_test, self.pred)), 'INFO')
|
||||
|
||||
def predict(self, tweet):
|
||||
message = self.cv.transform([tweet]).toarray()
|
||||
@ -244,8 +251,9 @@ class tweetFilter(object):
|
||||
pass
|
||||
|
||||
def tweetFilterTrain(self):
|
||||
self.Filter = multinomialNaiveBayes("/home/spam-filter/src/resources/tweet_spam_ham.csv")
|
||||
self.Filter = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv")
|
||||
|
||||
log("Training Filter", 'INFO')
|
||||
self.Filter.trainFilter()
|
||||
|
||||
### Self coded NB get around 75 -> 85% accuracy ( not as good as SKlearns )
|
||||
@ -258,4 +266,6 @@ class tweetFilter(object):
|
||||
def tweetFilterPredit(self, tweet):
|
||||
df = pd.DataFrame(self.Filter.predict(tweet))
|
||||
df[0] = df[0].map({0: 'ham', 1: 'spam'})
|
||||
log("Classification of tweet as {}".format(df[0][0]), 'INFO')
|
||||
|
||||
return df[0][0]
|
||||
|
||||
40
src/utils/jsonLogger.py
Normal file
40
src/utils/jsonLogger.py
Normal file
@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import logging
|
||||
from pythonjsonlogger import jsonlogger
|
||||
|
||||
import datetime
|
||||
|
||||
class CustomJsonFormatter(jsonlogger.JsonFormatter):
    """JSON log formatter adding an ELK-style '@timestamp' and an upper-cased 'level' field."""

    def add_fields(self, log_record, record, message_dict):
        """Populate '@timestamp' and 'level' on top of the base formatter's fields.

        Args:
            log_record: dict of fields that will be serialized to JSON.
            record: the originating logging.LogRecord.
            message_dict: extra fields supplied with the log call.
        """
        super(CustomJsonFormatter, self).add_fields(log_record, record, message_dict)
        if not log_record.get('@timestamp'):
            # Derive the timestamp from record.created so it reflects the
            # moment the record was emitted. The previous utcnow() call was
            # taken at format time and therefore slightly late.
            created = datetime.datetime.utcfromtimestamp(record.created)
            log_record['@timestamp'] = created.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        if log_record.get('level'):
            log_record['level'] = log_record['level'].upper()
        else:
            log_record['level'] = record.levelname
|
||||
|
||||
def setup_logging(log_level='INFO'):
    """Configure the module logger to emit JSON-formatted records.

    Idempotent: repeated calls no longer stack duplicate StreamHandlers
    (the original attached a fresh handler on every call, which duplicated
    every log line once per call).

    Args:
        log_level: logging level name or number (default 'INFO').
    """
    logger = logging.getLogger(__name__)
    logger.propagate = False  # keep records out of the root logger
    logger.setLevel(log_level)

    # Guard against duplicate handlers when setup_logging() is called again.
    if not logger.handlers:
        log_handler = logging.StreamHandler()
        formatter = CustomJsonFormatter('%(@timestamp)s %(level)s %(name)s %(message)s')
        log_handler.setFormatter(formatter)
        logger.addHandler(log_handler)
|
||||
|
||||
def log(message, level):
    """Log *message* at the named *level* on the module logger.

    Args:
        message: text to log.
        level: one of 'INFO', 'WARN', 'ERR', 'DEBUG'. Unrecognised values
            fall back to INFO instead of being silently dropped (the
            original if/elif chain discarded them without a trace).
    """
    logger = logging.getLogger(__name__)
    dispatch = {
        'INFO': logger.info,
        'WARN': logger.warning,  # Logger.warn is deprecated; warning() is canonical
        'ERR': logger.error,
        'DEBUG': logger.debug,
    }
    dispatch.get(level, logger.info)(message)
|
||||
Loading…
x
Reference in New Issue
Block a user