[14.10.20] Whitelisted words
This commit is contained in:
parent
9733c9d00d
commit
f2835ec589
@ -6,7 +6,7 @@ sys.path.append('/home/spam-filter/')
|
||||
from threading import Thread
|
||||
from tweets.tweetFilter import tweetFilter
|
||||
|
||||
from src.utils.jsonLogger import setup_logging, log
|
||||
from utils.jsonLogger import setup_logging, log
|
||||
|
||||
from flask import Flask, request
|
||||
|
||||
|
||||
@ -13,9 +13,9 @@ from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import classification_report, accuracy_score
|
||||
|
||||
from src.utils.jsonLogger import log
|
||||
from utils.jsonLogger import log
|
||||
|
||||
from src.tweets.whitelistedWords import filterOutTweetsWithNoneWhitelistedWords
|
||||
from tweets.whitelistedWords import filterOutTweetsWithNoneWhitelistedWords
|
||||
|
||||
# Global Metrics
|
||||
HB_NB_Precision = 0
|
||||
@ -27,8 +27,6 @@ HB_NB_Accuracy = 0
|
||||
def processTweet(text, gram = 2):
|
||||
tweet = text.lower() #Lower cases
|
||||
|
||||
tweet = filterOutTweetsWithNoneWhitelistedWords(tweet)
|
||||
|
||||
words = word_tokenize(tweet) #Tokenise words in text
|
||||
words = [w for w in words if len(w) > 2]
|
||||
|
||||
@ -255,7 +253,7 @@ class tweetFilter(object):
|
||||
pass
|
||||
|
||||
def tweetFilterTrain(self):
|
||||
self.Filter = multinomialNaiveBayes("/home/spam-filter/src/resources/tweet_spam_ham.csv")
|
||||
self.Filter = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv")
|
||||
|
||||
log("Training Filter", 'INFO')
|
||||
self.Filter.trainFilter()
|
||||
@ -267,9 +265,13 @@ class tweetFilter(object):
|
||||
# Filter.filterStatistics(prediction)
|
||||
#
|
||||
# Filter.testPrediction()
|
||||
def tweetFilterPredit(self, tweet):
|
||||
df = pd.DataFrame(self.Filter.predict(tweet))
|
||||
df[0] = df[0].map({0: 'ham', 1: 'spam'})
|
||||
log("Classification of tweet as {}".format(df[0][0]), 'INFO')
|
||||
def tweetFilterPredit(self, text):
|
||||
tweet = filterOutTweetsWithNoneWhitelistedWords(text)
|
||||
if tweet != "":
|
||||
df = pd.DataFrame(self.Filter.predict(tweet))
|
||||
df[0] = df[0].map({0: 'ham', 1: 'spam'})
|
||||
log("Classification of tweet as {}".format(df[0][0]), 'INFO')
|
||||
|
||||
return df[0][0]
|
||||
return df[0][0]
|
||||
else:
|
||||
return "spam"
|
||||
@ -1,5 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from utils.jsonLogger import log
|
||||
|
||||
whitelist = [
|
||||
"bull",
|
||||
"bear",
|
||||
@ -145,4 +147,5 @@ def filterOutTweetsWithNoneWhitelistedWords(text):
|
||||
if any(x in text for x in whitelist):
|
||||
return text
|
||||
else:
|
||||
log("Tweet [{}] did not contain any keywords for it to be considered crypto related", 'WARN')
|
||||
log("Tweet [{}] did not contain any keywords for it to be considered crypto related".format(text), 'WARN')
|
||||
return ""
|
||||
Loading…
x
Reference in New Issue
Block a user