From f2835ec5891adc303e44b782cd8da2239556f33e Mon Sep 17 00:00:00 2001 From: andrewso <9V5f1FkzI2LD> Date: Wed, 14 Oct 2020 14:33:50 +0100 Subject: [PATCH] [14.10.20] Whitelisted words --- src/main.py | 2 +- src/tweets/tweetFilter.py | 22 ++++++++++++---------- src/tweets/whitelistedWords.py | 5 ++++- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/main.py b/src/main.py index de97072..42c0335 100644 --- a/src/main.py +++ b/src/main.py @@ -6,7 +6,7 @@ sys.path.append('/home/spam-filter/') from threading import Thread from tweets.tweetFilter import tweetFilter -from src.utils.jsonLogger import setup_logging, log +from utils.jsonLogger import setup_logging, log from flask import Flask, request diff --git a/src/tweets/tweetFilter.py b/src/tweets/tweetFilter.py index 457cbdf..72634d6 100644 --- a/src/tweets/tweetFilter.py +++ b/src/tweets/tweetFilter.py @@ -13,9 +13,9 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, accuracy_score -from src.utils.jsonLogger import log +from utils.jsonLogger import log -from src.tweets.whitelistedWords import filterOutTweetsWithNoneWhitelistedWords +from tweets.whitelistedWords import filterOutTweetsWithNoneWhitelistedWords # Global Metrics HB_NB_Precision = 0 @@ -27,8 +27,6 @@ HB_NB_Accuracy = 0 def processTweet(text, gram = 2): tweet = text.lower() #Lower cases - tweet = filterOutTweetsWithNoneWhitelistedWords(tweet) - words = word_tokenize(tweet) #Tokenise words in text words = [w for w in words if len(w) > 2] @@ -255,7 +253,7 @@ class tweetFilter(object): pass def tweetFilterTrain(self): - self.Filter = multinomialNaiveBayes("/home/spam-filter/src/resources/tweet_spam_ham.csv") + self.Filter = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv") log("Training Filter", 'INFO') self.Filter.trainFilter() @@ -267,9 +265,13 @@ class tweetFilter(object): # Filter.filterStatistics(prediction) # # Filter.testPrediction() - def tweetFilterPredit(self, tweet): - df = pd.DataFrame(self.Filter.predict(tweet)) - df[0] = df[0].map({0: 'ham', 1: 'spam'}) - log("Classification of tweet as {}".format(df[0][0]), 'INFO') + def tweetFilterPredit(self, text): + tweet = filterOutTweetsWithNoneWhitelistedWords(text) + if tweet != "": + df = pd.DataFrame(self.Filter.predict(tweet)) + df[0] = df[0].map({0: 'ham', 1: 'spam'}) + log("Classification of tweet as {}".format(df[0][0]), 'INFO') - return df[0][0] + return df[0][0] + else: + return "spam" \ No newline at end of file diff --git a/src/tweets/whitelistedWords.py b/src/tweets/whitelistedWords.py index cccdefa..22b5f78 100644 --- a/src/tweets/whitelistedWords.py +++ b/src/tweets/whitelistedWords.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +from utils.jsonLogger import log + whitelist = [ "bull", "bear", @@ -145,4 +147,5 @@ def filterOutTweetsWithNoneWhitelistedWords(text): if any(x in text for x in whitelist): return text else: - log("Tweet [{}] did not contain any keywords for it to be considered crypto related", 'WARN') \ No newline at end of file + log("Tweet [{}] did not contain any keywords for it to be considered crypto related".format(text), 'WARN') + return "" \ No newline at end of file