#!/usr/bin/env python

import sys
import math

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Imported last, so a bare `log` always refers to the project logger;
# math.log is used explicitly for the probability computations below
from src.utils.jsonLogger import log

# Global metrics, updated by metrics() below
HB_NB_Precision = 0
HB_NB_Recall = 0
HB_NB_F_Score = 0
HB_NB_Accuracy = 0


## Logic

def processTweet(tweet, gram=2):
    tweet = tweet.lower()  # Lower-case the text

    words = word_tokenize(tweet)  # Tokenise words in the text
    words = [w for w in words if len(w) > 2]

    if gram > 2:  # Larger grams can increase accuracy; with the default gram=2 this branch is skipped
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
        return w

    # Remove stopwords
    sw = stopwords.words('english')
    words = [word for word in words if word not in sw]

    stemmer = PorterStemmer()  # Stem words
    words = [stemmer.stem(word) for word in words]

    return words

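# A rough example of the pipeline above (output depends on the NLTK data
# installed; the exact stems are illustrative, not guaranteed):
#
#   processTweet("Bitcoin closed with some gains in February")
#   # tokenise -> drop short words and stopwords -> stem
#   # ~ ['bitcoin', 'close', 'gain', 'februari']

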
class classifier(object):
    def __init__(self, trainData):
        self.tweet = trainData['tweet']
        self.labels = trainData['class']

    def train(self):
        self.TF_and_IDF()  # Bag of words: per-class term and document frequencies
        self.TF_IDF()      # Smoothed TF-IDF word probabilities

    def TF_and_IDF(self):
        noTweets = self.tweet.shape[0]
        self.spam = self.labels.value_counts()[1]
        self.ham = self.labels.value_counts()[0]
        self.total = self.spam + self.ham

        # Initialise spam/ham counters
        self.spamCount = 0
        self.hamCount = 0
        self.tfSpam = dict()
        self.tfHam = dict()
        self.idfSpam = dict()
        self.idfHam = dict()

        ## Logic
        for entry in range(noTweets):
            processed = processTweet(self.tweet[entry])
            count = list()  # Tracks whether the word has occurred in this message: the IDF count
            for word in processed:
                if self.labels[entry]:
                    self.tfSpam[word] = self.tfSpam.get(word, 0) + 1
                    self.spamCount += 1
                else:
                    self.tfHam[word] = self.tfHam.get(word, 0) + 1
                    self.hamCount += 1
                if word not in count:  # Additive (Laplace) smoothing is applied later, in TF_IDF()
                    count += [word]
            for word in count:
                if self.labels[entry]:
                    self.idfSpam[word] = self.idfSpam.get(word, 0) + 1
                else:
                    self.idfHam[word] = self.idfHam.get(word, 0) + 1

    def TF_IDF(self):
        self.probSpam = dict()
        self.probHam = dict()
        self.sumSpam = 0
        self.sumHam = 0
        for word in self.tfSpam:
            self.probSpam[word] = self.tfSpam[word] * math.log((self.spam + self.ham) / (self.idfSpam[word] + self.idfHam.get(word, 0)))
            self.sumSpam += self.probSpam[word]
        for word in self.tfSpam:
            self.probSpam[word] = (self.probSpam[word] + 1) / (self.sumSpam + len(self.probSpam))
        for word in self.tfHam:
            self.probHam[word] = self.tfHam[word] * math.log((self.spam + self.ham) / (self.idfSpam.get(word, 0) + self.idfHam[word]))
            self.sumHam += self.probHam[word]
        for word in self.tfHam:
            self.probHam[word] = (self.probHam[word] + 1) / (self.sumHam + len(self.probHam))

        self.probSpamTotal, self.probHamTotal = self.spam / self.total, self.ham / self.total

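    # For reference, TF_IDF() above estimates smoothed class-conditional word
    # weights from TF-IDF mass rather than raw counts:
    #
    #   tfidf(w, c) = tf_c(w) * log(N / df(w))
    #   P(w | c)    = (tfidf(w, c) + 1) / (sum_w' tfidf(w', c) + |V_c|)
    #
    # where N is the total number of training tweets, df(w) is the number of
    # tweets containing w, and |V_c| is the class vocabulary size. The "+1"
    # is the additive (Laplace) smoothing noted in TF_and_IDF().
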
    def classify(self, processed):
        # Log-space Naive Bayes decision: compare log P(spam) + sum(log P(w|spam))
        # against the same quantity for ham. Unseen words fall back to the
        # smoothing denominator used in TF_IDF().
        pSpam, pHam = 0, 0
        for word in processed:
            if word in self.probSpam:
                pSpam += math.log(self.probSpam[word])
            else:
                pSpam -= math.log(self.sumSpam + len(self.probSpam))
            if word in self.probHam:
                pHam += math.log(self.probHam[word])
            else:
                pHam -= math.log(self.sumHam + len(self.probHam))
        pSpam += math.log(self.probSpamTotal)
        pHam += math.log(self.probHamTotal)
        return pSpam >= pHam

    def predict(self, testData):
        result = dict()
        for (i, tweet) in enumerate(testData):
            processed = processTweet(tweet)
            result[i] = int(self.classify(processed))
        return result


def metrics(labels, predictions):
    # Without this declaration the module-level metrics above would never
    # be updated; the assignments below would only bind local names
    global HB_NB_Precision, HB_NB_Recall, HB_NB_F_Score, HB_NB_Accuracy

    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels[i] == 1 and predictions[i] == 1)
        true_neg += int(labels[i] == 0 and predictions[i] == 0)
        false_pos += int(labels[i] == 0 and predictions[i] == 1)
        false_neg += int(labels[i] == 1 and predictions[i] == 0)
    HB_NB_Precision = true_pos / (true_pos + false_pos)
    HB_NB_Recall = true_pos / (true_pos + false_neg)
    HB_NB_F_Score = 2 * HB_NB_Precision * HB_NB_Recall / (HB_NB_Precision + HB_NB_Recall)
    HB_NB_Accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)

    print("HB Precision: ", HB_NB_Precision)
    print("HB Recall: ", HB_NB_Recall)
    print("HB F-score: ", HB_NB_F_Score)
    print("HB Accuracy: ", HB_NB_Accuracy)

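# A small worked example (hypothetical data): predictions is the dict
# returned by classifier.predict(), keyed by position, which lines up with
# the reset integer index of testData['class']:
#
#   metrics(pd.Series([1, 0, 1]), {0: 1, 1: 0, 2: 0})
#   # precision = 1.0, recall = 0.5, F-score ~= 0.67, accuracy ~= 0.67

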
class filterSpam(object):
    def __init__(self, training_set):
        self.training_set = training_set

    def trainFilter(self):
        self.dataset()
        self.train()

    def dataset(self):
        self.data = pd.read_csv(self.training_set)

        self.data['class'] = self.data['classes'].map({'ham': 0, 'spam': 1})
        self.data.drop(['classes'], axis=1, inplace=True)

        # Random ~75/25 train/test split
        self.trainIndex, self.testIndex = list(), list()
        for i in range(self.data.shape[0]):
            if np.random.uniform(0, 1) < 0.75:
                self.trainIndex += [i]
            else:
                self.testIndex += [i]
        self.trainData = self.data.loc[self.trainIndex]
        self.testData = self.data.loc[self.testIndex]

        self.trainData.reset_index(inplace=True)
        self.testData.reset_index(inplace=True)
        self.trainData.drop(['index'], axis=1, inplace=True)
        self.testData.drop(['index'], axis=1, inplace=True)

    def train(self):
        self.spamFilter = classifier(self.trainData)
        self.spamFilter.train()

    def testData_Prediction(self):
        prediction = self.spamFilter.predict(self.testData['tweet'])

        return prediction

    def testPrediction(self):
        # Test spam/ham tweets - should return True and False respectively
        spam = processTweet("Earn more than 0015 btc free No deposit No investment Free Bitcoins - Earn $65 free btc in 5 minutes bitcoin freebtc getbtc")
        ham = processTweet("Bitcoin closed with some gains in month of February")

        hamTweet = self.spamFilter.classify(ham)
        spamTweet = self.spamFilter.classify(spam)

        print("Console: ", "Spam Tweet -- ", spamTweet)
        sys.stdout.flush()
        print("Console: ", "Ham Tweet -- ", hamTweet)
        sys.stdout.flush()

    def filterStatistics(self, prediction):
        metrics(self.testData['class'], prediction)

    def filterTweet(self, tweet):
        processed = processTweet(tweet)
        classified = self.spamFilter.classify(processed)

        return classified


class multinomialNaiveBayes(object):
    def __init__(self, training_set):
        self.training_set = training_set

    def trainFilter(self):
        self.dataset()
        self.train()
        self.predictTest()

    def dataset(self):
        log("Creating Training and Test datasets", 'INFO')
        self.data = pd.read_csv(self.training_set)

        self.data.drop_duplicates(inplace=True)

        self.data['class'] = self.data['classes'].map({'ham': 0, 'spam': 1})

        # processTweet doubles as the CountVectorizer analyzer, so both
        # classifiers share the same tokenise/stopword/stem pipeline
        self.cv = CountVectorizer(analyzer=processTweet)

        messages_bow = self.cv.fit_transform(self.data['tweet'])

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(messages_bow, self.data['class'], test_size=0.20, random_state=0)

    def train(self):
        self.classifier = MultinomialNB()

        log("Fitting Split Train datasets against Bayes Classifier", 'INFO')
        self.classifier.fit(self.X_train, self.y_train)

    def predictTest(self):
        log("Testing Prediction against X Test Dataset", 'INFO')
        self.pred = self.classifier.predict(self.X_test)

        log('Accuracy Prediction against Y Test Dataset :: {}'.format(accuracy_score(self.y_test, self.pred)), 'INFO')

    def predict(self, tweet):
        message = self.cv.transform([tweet]).toarray()

        return self.classifier.predict(message)


class tweetFilter(object):
    def __init__(self):
        pass

    def tweetFilterTrain(self):
        self.Filter = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv")

        log("Training Filter", 'INFO')
        self.Filter.trainFilter()

        ### The self-coded NB reaches around 75-85% accuracy (not as good as sklearn's):
        # Filter = filterSpam("src/resources/tweet_spam_ham.csv")
        # Filter.trainFilter()
        #
        # prediction = Filter.testData_Prediction()
        # Filter.filterStatistics(prediction)
        #
        # Filter.testPrediction()

    def tweetFilterPredit(self, tweet):
        df = pd.DataFrame(self.Filter.predict(tweet))
        df[0] = df[0].map({0: 'ham', 1: 'spam'})
        log("Classification of tweet as {}".format(df[0][0]), 'INFO')

        return df[0][0]
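

# A minimal usage sketch (assumes the training CSV the module already
# references plus NLTK's 'punkt' and 'stopwords' data are available; the
# sample tweets are hypothetical):
if __name__ == '__main__':
    tf = tweetFilter()
    tf.tweetFilterTrain()
    print(tf.tweetFilterPredit("Free btc! Earn bitcoin now, no deposit"))      # likely 'spam'
    print(tf.tweetFilterPredit("Bitcoin closed with some gains in February"))  # likely 'ham'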