[14.10.20] Whitelisted words

This commit is contained in:
andrewso 2020-10-14 14:33:50 +01:00
parent 9733c9d00d
commit f2835ec589
3 changed files with 17 additions and 12 deletions

View File

@ -6,7 +6,7 @@ sys.path.append('/home/spam-filter/')
from threading import Thread
from tweets.tweetFilter import tweetFilter
from src.utils.jsonLogger import setup_logging, log
from utils.jsonLogger import setup_logging, log
from flask import Flask, request

View File

@ -13,9 +13,9 @@ from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from src.utils.jsonLogger import log
from utils.jsonLogger import log
from src.tweets.whitelistedWords import filterOutTweetsWithNoneWhitelistedWords
from tweets.whitelistedWords import filterOutTweetsWithNoneWhitelistedWords
# Global Metrics
HB_NB_Precision = 0
@ -27,8 +27,6 @@ HB_NB_Accuracy = 0
def processTweet(text, gram = 2):
tweet = text.lower() #Lower cases
tweet = filterOutTweetsWithNoneWhitelistedWords(tweet)
words = word_tokenize(tweet) #Tokenise words in text
words = [w for w in words if len(w) > 2]
@ -255,7 +253,7 @@ class tweetFilter(object):
pass
def tweetFilterTrain(self):
self.Filter = multinomialNaiveBayes("/home/spam-filter/src/resources/tweet_spam_ham.csv")
self.Filter = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv")
log("Training Filter", 'INFO')
self.Filter.trainFilter()
@ -267,9 +265,13 @@ class tweetFilter(object):
# Filter.filterStatistics(prediction)
#
# Filter.testPrediction()
def tweetFilterPredit(self, tweet):
df = pd.DataFrame(self.Filter.predict(tweet))
df[0] = df[0].map({0: 'ham', 1: 'spam'})
log("Classification of tweet as {}".format(df[0][0]), 'INFO')
def tweetFilterPredit(self, text):
tweet = filterOutTweetsWithNoneWhitelistedWords(text)
if tweet != "":
df = pd.DataFrame(self.Filter.predict(tweet))
df[0] = df[0].map({0: 'ham', 1: 'spam'})
log("Classification of tweet as {}".format(df[0][0]), 'INFO')
return df[0][0]
return df[0][0]
else:
return "spam"

View File

@ -1,5 +1,7 @@
#!/usr/bin/env python
from utils.jsonLogger import log
whitelist = [
"bull",
"bear",
@ -145,4 +147,5 @@ def filterOutTweetsWithNoneWhitelistedWords(text):
if any(x in text for x in whitelist):
return text
else:
log("Tweet [{}] did not contain any keywords for it to be considered crypto related", 'WARN')
log("Tweet [{}] did not contain any keywords for it to be considered crypto related".format(text), 'WARN')
return ""