#!/usr/bin/env python
# spam-filter/src/tweets/tweetFilter.py
import os
import sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from math import log, sqrt
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from src.utils.jsonLogger import log as jsonLog  # aliased: the bare name would shadow math.log used in TF_IDF/classify
# Global Metrics
HB_NB_Precision = 0
HB_NB_Recall = 0
HB_NB_F_Score = 0
HB_NB_Accuracy = 0

## Logic
def processTweet(tweet, gram=2):
    tweet = tweet.lower()  # Lower-case the text
    words = word_tokenize(tweet)  # Tokenise words in text
    words = [w for w in words if len(w) > 2]  # Drop very short tokens
    if gram > 2:  ## Increasing grams can increase accuracy
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
        return w
    # Remove stopwords
    sw = stopwords.words('english')
    words = [word for word in words if word not in sw]
    stemmer = PorterStemmer()  # Stem words
    words = [stemmer.stem(word) for word in words]
    return words
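
# An illustrative trace (hypothetical example, not from the original file):
# with the default gram=2 the function returns stemmed unigrams, roughly
#   processTweet("Bitcoin closed with some gains in month of February")
#   -> ['bitcoin', 'close', 'gain', 'month', 'februari']
# ('februari' being the Porter stem of 'february'). With gram=3 it instead
# returns raw n-grams of the length-filtered tokens, with no stopword
# removal or stemming applied.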

class classifier(object):
    def __init__(self, trainData):
        self.tweet = trainData['tweet']
        self.labels = trainData['class']

    def train(self):
        self.TF_and_IDF()  ## Bag of words
        self.TF_IDF()  ## Term frequencies

    def TF_and_IDF(self):
        noTweets = self.tweet.shape[0]
        self.spam = self.labels.value_counts()[1]
        self.ham = self.labels.value_counts()[0]
        self.total = self.spam + self.ham
        # Initialise spam vars
        self.spamCount = 0
        self.hamCount = 0
        self.tfSpam = dict()
        self.tfHam = dict()
        self.idfSpam = dict()
        self.idfHam = dict()
        ## Logic
        for entry in range(noTweets):
            processed = processTweet(self.tweet[entry])
            count = list()  # Tracks whether the word has occurred in this message (document-frequency count)
            for word in processed:
                if self.labels[entry]:
                    self.tfSpam[word] = self.tfSpam.get(word, 0) + 1
                    self.spamCount += 1
                else:
                    self.tfHam[word] = self.tfHam.get(word, 0) + 1
                    self.hamCount += 1
                if word not in count:
                    count += [word]
            for word in count:
                if self.labels[entry]:
                    self.idfSpam[word] = self.idfSpam.get(word, 0) + 1
                else:
                    self.idfHam[word] = self.idfHam.get(word, 0) + 1
    def TF_IDF(self):
        self.probSpam = dict()
        self.probHam = dict()
        self.sumSpam = 0
        self.sumHam = 0
        # TF-IDF weighting, followed by additive (Laplace) smoothing
        for word in self.tfSpam:
            self.probSpam[word] = self.tfSpam[word] * log((self.spam + self.ham) / (self.idfSpam[word] + self.idfHam.get(word, 0)))
            self.sumSpam += self.probSpam[word]
        for word in self.tfSpam:
            self.probSpam[word] = (self.probSpam[word] + 1) / (self.sumSpam + len(self.probSpam))
        for word in self.tfHam:
            self.probHam[word] = self.tfHam[word] * log((self.spam + self.ham) / (self.idfSpam.get(word, 0) + self.idfHam[word]))
            self.sumHam += self.probHam[word]
        for word in self.tfHam:
            self.probHam[word] = (self.probHam[word] + 1) / (self.sumHam + len(self.probHam))
        self.probSpamTotal, self.probHamTotal = self.spam / self.total, self.ham / self.total
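    # Assumed reading of the weighting above (not stated in the original):
    # each word is scored as tf(word, class) * log(N / df(word)), where
    # N = self.spam + self.ham is the total number of training tweets and
    # df is the number of tweets containing the word; the
    # (score + 1) / (sum + vocab size) pass then Laplace-smooths the scores
    # into a probability-like distribution that classify() can take logs of.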
    def classify(self, processed):
        pSpam, pHam = 0, 0
        for word in processed:
            if word in self.probSpam:
                pSpam += log(self.probSpam[word])
            else:
                pSpam -= log(self.sumSpam + len(self.probSpam))
            if word in self.probHam:
                pHam += log(self.probHam[word])
            else:
                pHam -= log(self.sumHam + len(self.probHam))
        pSpam += log(self.probSpamTotal)
        pHam += log(self.probHamTotal)
        return pSpam >= pHam

    def predict(self, testData):
        result = dict()
        for (i, tweet) in enumerate(testData):
            processed = processTweet(tweet)
            result[i] = int(self.classify(processed))
        return result
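
# A minimal usage sketch for the hand-rolled classifier (assumes `train_df`
# is a DataFrame with 'tweet' and 'class' columns, class 1 = spam, 0 = ham):
#
#   clf = classifier(train_df)
#   clf.train()
#   clf.classify(processTweet("free btc no deposit"))  # True when scored as spam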

def metrics(labels, predictions):
    # Write to the module-level metrics; without this declaration the
    # assignments below would only create locals
    global HB_NB_Precision, HB_NB_Recall, HB_NB_F_Score, HB_NB_Accuracy
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels[i] == 1 and predictions[i] == 1)
        true_neg += int(labels[i] == 0 and predictions[i] == 0)
        false_pos += int(labels[i] == 0 and predictions[i] == 1)
        false_neg += int(labels[i] == 1 and predictions[i] == 0)
    HB_NB_Precision = true_pos / (true_pos + false_pos)
    HB_NB_Recall = true_pos / (true_pos + false_neg)
    HB_NB_F_Score = 2 * HB_NB_Precision * HB_NB_Recall / (HB_NB_Precision + HB_NB_Recall)
    HB_NB_Accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
    print("HB Precision: ", HB_NB_Precision)
    print("HB Recall: ", HB_NB_Recall)
    print("HB F-score: ", HB_NB_F_Score)
    print("HB Accuracy: ", HB_NB_Accuracy)

class filterSpam(object):
    def __init__(self, training_set):
        self.training_set = training_set

    def trainFilter(self):
        self.dataset()
        self.train()

    def dataset(self):
        self.data = pd.read_csv(self.training_set)
        self.data['class'] = self.data['classes'].map({'ham': 0, 'spam': 1})
        self.data.drop(['classes'], axis=1, inplace=True)
        # Random ~75/25 train/test split
        self.trainIndex, self.testIndex = list(), list()
        for i in range(self.data.shape[0]):
            if np.random.uniform(0, 1) < 0.75:
                self.trainIndex += [i]
            else:
                self.testIndex += [i]
        self.trainData = self.data.loc[self.trainIndex].reset_index(drop=True)
        self.testData = self.data.loc[self.testIndex].reset_index(drop=True)

    def train(self):
        self.spamFilter = classifier(self.trainData)
        self.spamFilter.train()

    def testData_Prediction(self):
        prediction = self.spamFilter.predict(self.testData['tweet'])
        return prediction
    def testPrediction(self):
        # Test spam/ham tweets - should return True and False respectively
        spam = processTweet("Earn more than 0015 btc free No deposit No investment Free Bitcoins - Earn $65 free btc in 5 minutes bitcoin freebtc getbtc")
        ham = processTweet("Bitcoin closed with some gains in month of February")
        hamTweet = self.spamFilter.classify(ham)
        spamTweet = self.spamFilter.classify(spam)
        print("Console: ", "Spam Tweet -- ", spamTweet)
        sys.stdout.flush()
        print("Console: ", "Ham Tweet -- ", hamTweet)
        sys.stdout.flush()

    def filterStatistics(self, prediction):
        metrics(self.testData['class'], prediction)

    def filterTweet(self, tweet):
        processed = processTweet(tweet)
        classified = self.spamFilter.classify(processed)
        return classified
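
# End-to-end sketch for the hand-rolled filter (the CSV path mirrors the one
# used by tweetFilter below):
#
#   f = filterSpam("src/resources/tweet_spam_ham.csv")
#   f.trainFilter()
#   f.filterStatistics(f.testData_Prediction())  # prints the HB_NB_* metrics
#   f.filterTweet("free btc no deposit")         # True (spam) / False (ham)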

class multinomialNaiveBayes(object):
    def __init__(self, training_set):
        self.training_set = training_set

    def trainFilter(self):
        self.dataset()
        self.train()
        self.predictTest()

    def dataset(self):
        jsonLog("Creating Training and Test datasets", 'INFO')
        self.data = pd.read_csv(self.training_set)
        self.data.drop_duplicates(inplace=True)
        self.data['class'] = self.data['classes'].map({'ham': 0, 'spam': 1})
        self.cv = CountVectorizer(analyzer=processTweet)  # processTweet does the tokenising, stopword removal and stemming
        messages_bow = self.cv.fit_transform(self.data['tweet'])
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            messages_bow, self.data['class'], test_size=0.20, random_state=0)

    def train(self):
        self.classifier = MultinomialNB()
        jsonLog("Fitting split train datasets against Bayes classifier", 'INFO')
        self.classifier.fit(self.X_train, self.y_train)

    def predictTest(self):
        jsonLog("Testing prediction against X test dataset", 'INFO')
        self.pred = self.classifier.predict(self.X_test)
        jsonLog('Accuracy prediction against y test dataset :: {}'.format(accuracy_score(self.y_test, self.pred)), 'INFO')

    def predict(self, tweet):
        message = self.cv.transform([tweet]).toarray()
        return self.classifier.predict(message)
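
# Sketch of driving the sklearn path directly (same CSV as used by
# tweetFilter below):
#
#   nb = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv")
#   nb.trainFilter()                         # logs the test-set accuracy
#   nb.predict("btc giveaway, deposit now")  # -> array([1]) spam, array([0]) ham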

class tweetFilter(object):
    def __init__(self):
        pass

    def tweetFilterTrain(self):
        self.Filter = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv")
        jsonLog("Training Filter", 'INFO')
        self.Filter.trainFilter()
        ### The self-coded NB gets around 75-85% accuracy (not as good as sklearn's):
        # Filter = filterSpam("src/resources/tweet_spam_ham.csv")
        # Filter.trainFilter()
        #
        # prediction = Filter.testData_Prediction()
        # Filter.filterStatistics(prediction)
        #
        # Filter.testPrediction()

    def tweetFilterPredit(self, tweet):
        df = pd.DataFrame(self.Filter.predict(tweet))
        df[0] = df[0].map({0: 'ham', 1: 'spam'})
        jsonLog("Classification of tweet as {}".format(df[0][0]), 'INFO')
        return df[0][0]
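

# A minimal smoke test (a sketch, not part of the original module); assumes
# the script is run from the repository root so the relative CSV path above
# resolves, and that nltk's 'punkt' and 'stopwords' data are downloaded:
if __name__ == "__main__":
    tf = tweetFilter()
    tf.tweetFilterTrain()
    # tweetFilterPredit returns the string label 'ham' or 'spam'
    print(tf.tweetFilterPredit("Bitcoin closed with some gains in month of February"))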