[05.03.20] Spam filter for tweets: Kubernetes deployment, CI pipeline, Naive Bayes training, and endpoint exposure

This commit is contained in:
andrewso 2020-03-05 18:42:52 +00:00
parent a418a08aa1
commit 7b519630a2
12 changed files with 1533 additions and 0 deletions

9
Dockerfile Normal file

@@ -0,0 +1,9 @@
FROM python:3.7-alpine
MAINTAINER Andrew Sotheran <cryptosky.user@gmail.com>
RUN apk update && \
    apk add py-pip libc-dev gcc
RUN pip install utils pycryptodome && \
    pip install nltk pandas numpy sklearn flask
COPY . /home/spam-filter/.
EXPOSE 9090
CMD ["python", "/home/spam-filter/src/main.py"]

2
README.md Normal file

@@ -0,0 +1,2 @@
# Spam-Filter
Universal spam filter for news and tweet data.
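
The service trains a multinomial Naive Bayes model on startup and then serves predictions over HTTP on port 9090. A minimal sketch of querying the `/predict` endpoint (assuming the `requests` package is available on the client side):

```python
import requests

# /predict takes the tweet text as a query parameter and returns
# {"result": "spam" | "ham", "tweet": "<original text>"}
response = requests.get(
    "http://localhost:9090/predict",
    params={"tweet": "Earn $65 free btc in 5 minutes, no deposit needed"},
)
print(response.json())
```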

77
configuration/kubernetes/deployment.yaml Normal file

@@ -0,0 +1,77 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: LABEL
  name: RESOURCE_NAME
  namespace: production
spec:
  replicas: 1
  selector:
    matchLabels:
      app: RESOURCE_NAME
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: RESOURCE_NAME
    spec:
      containers:
      - image: REPOSITORY/IMAGE
        name: RESOURCE_NAME
        env:
        - name: KUBERNETES_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: CONTAINER_CORE_LIMIT
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
        - name: CONTAINER_MAX_MEMORY
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
        - name: DATABASE_URL
          valueFrom:
            secretKeyRef:
              name: endpoints
              key: dbGateway.url
        - name: DATABASE_PORT
          valueFrom:
            secretKeyRef:
              name: endpoints
              key: dbGateway.port
        - name: COINBASE_KEY
          valueFrom:
            secretKeyRef:
              name: coinbase-secret
              key: coinbase.api.key
        - name: COINBASE_SECRET
          valueFrom:
            secretKeyRef:
              name: coinbase-secret
              key: coinbase.api.secret
        - name: DB_GATEWAY_URL
          valueFrom:
            secretKeyRef:
              name: endpoints
              key: dbGateway.url
        ports:
        - containerPort: 9090
          name: RESOURCE_NAME
        imagePullPolicy: Always
        resources:
          requests:
            cpu: 50m
            memory: 32Mi
          limits:
            cpu: 100m
            memory: 32Mi
      restartPolicy: Always
      imagePullSecrets:
      - name: registry-secret

16
configuration/kubernetes/service.yaml Normal file

@@ -0,0 +1,16 @@
kind: Service
apiVersion: v1
metadata:
  labels:
    name: LABEL
  name: RESOURCE_NAME
  namespace: production
spec:
  selector:
    app: RESOURCE_NAME
  ports:
  - port: 9090
    protocol: TCP  # Service protocol must be TCP/UDP/SCTP; HTTP is not valid here
    targetPort: 9090
  sessionAffinity: None
  type: ClusterIP

100
Jenkinsfile Normal file

@@ -0,0 +1,100 @@
#!/usr/bin/env groovy

env.APPLICATION_NAME = 'spam-filter'
env.APPLICATION_LABEL = 'utilities'
env.GIT_BRANCH = 'master'
env.GIT_REPOSITORY_PATH = "github.com/andyjk15/${env.APPLICATION_NAME}.git"
env.GIT_REPOSITORY_URL = "https://${env.GIT_REPOSITORY_PATH}"
env.GITHUB_CREDENTIALS_ID = 'Github'
env.DOCKER_REPOSITORY = 'registry.cryptosky.me'
env.DOCKER_REPOSITORY_URL = "https://${env.DOCKER_REPOSITORY}"
env.DOCKER_REPOSITORY_TCP = "tcp://${env.DOCKER_REPOSITORY}:4243"
env.NAMESPACE = 'production'
env.SLAVE_LABEL = "cryptosky-aio-build"

def mvn( String goals ) {
    sh "mvn -s configuration/settings.xml --show-version --batch-mode ${goals}"
}

String get_application_version() {
    "1.0.0-b${env.BUILD_NUMBER}"
}

String executeShellScript( String shellPath, String arg1 = '', String arg2 = '', String arg3 = '', String arg4 = '' ) {
    sh "./${shellPath} ${arg1} ${arg2} ${arg3} ${arg4}"
}

try {
    timestamps {
        node ("${env.SLAVE_LABEL}") {
            stage('Initialise') {
                checkout([$class: 'GitSCM', branches: [[name: env.GIT_BRANCH]], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[credentialsId: env.GITHUB_CREDENTIALS_ID, url: env.GIT_REPOSITORY_URL]]])
                env.APPLICATION_VERSION = get_application_version()
                withCredentials(
                    [usernamePassword(
                        credentialsId: 'doctl',
                        passwordVariable: 'DOCTL_TOKEN',
                        usernameVariable: 'DOCTL_USERNAME'
                    )]
                ) {
                    sh "doctl auth init --access-token ${DOCTL_TOKEN}"
                    sh "doctl kubernetes cluster kubeconfig save cryptosky-kubernetes-cluster-production"
                }
            }
            stage('Test Artifact') {
                try {
                    // mvn 'verify -DskipUTs -DskipTests'
                } finally {
                    // mvn 'test'
                }
            }
            stage('Build Image') {
                // mvn 'clean package -DskipTests'
                executeShellScript("configuration/scripts/mapVarsToConfigs.sh",
                    env.DOCKER_REPOSITORY,
                    env.APPLICATION_NAME,
                    env.APPLICATION_VERSION,
                    env.APPLICATION_LABEL)
            }
            stage('Tag Repository') {
                withDockerServer([uri: "${env.DOCKER_REPOSITORY_TCP}"]) {
                    withDockerRegistry([credentialsId: 'Registry', url: "${env.DOCKER_REPOSITORY_URL}"]) {
                        def image = docker.build("${env.DOCKER_REPOSITORY}/${env.APPLICATION_NAME}:${env.APPLICATION_VERSION}")
                        image.push()
                        def latest = docker.build("${env.DOCKER_REPOSITORY}/${env.APPLICATION_NAME}:latest")
                        latest.push()
                    }
                }
                withCredentials(
                    [usernamePassword(
                        credentialsId: env.GITHUB_CREDENTIALS_ID,
                        passwordVariable: 'GIT_PASSWORD',
                        usernameVariable: 'GIT_USERNAME'
                    )]
                ) {
                    sh "git tag ${env.APPLICATION_VERSION}"
                    sh "git push https://${GIT_USERNAME}:${GIT_PASSWORD}@${env.GIT_REPOSITORY_PATH} ${env.APPLICATION_VERSION}"
                }
            }
            stage('Deploy') {
                executeShellScript("configuration/scripts/deployToKubernetes.sh",
                    env.APPLICATION_NAME)
            }
        }
    }
} catch ( exception ) {
    currentBuild.result = 'FAILURE'
    throw exception
} finally {
    // Do not let a failed build be reported as a success
    if (currentBuild.result != 'FAILURE') {
        currentBuild.result = 'SUCCESS'
    }
}

10
configuration/scripts/deployToKubernetes.sh Normal file

@@ -0,0 +1,10 @@
#!/usr/bin/env bash
APPLICATION_NAME=$1
kubectl apply -f configuration/kubernetes/deployment.yaml
kubectl apply -f configuration/kubernetes/service.yaml
kubectl get pods
kubectl rollout status deployment/${APPLICATION_NAME} --namespace=production

14
configuration/scripts/mapVarsToConfigs.sh Normal file

@@ -0,0 +1,14 @@
#!/usr/bin/env bash
DOCKER_REPOSITORY=$1
APPLICATION_NAME=$2
APPLICATION_VERSION=$3
APPLICATION_LABEL=$4
sed -i "s/REPOSITORY/${DOCKER_REPOSITORY}/g" configuration/kubernetes/deployment.yaml
sed -i "s/IMAGE/${APPLICATION_NAME}:${APPLICATION_VERSION}/g" configuration/kubernetes/deployment.yaml
sed -i "s/RESOURCE_NAME/${APPLICATION_NAME}/g" configuration/kubernetes/deployment.yaml
sed -i "s/LABEL/${APPLICATION_LABEL}/g" configuration/kubernetes/deployment.yaml
sed -i "s/RESOURCE_NAME/${APPLICATION_NAME}/g" configuration/kubernetes/service.yaml
sed -i "s/LABEL/${APPLICATION_LABEL}/g" configuration/kubernetes/service.yaml

0
src/__init__.py Normal file

37
src/main.py Normal file

@@ -0,0 +1,37 @@
#!/usr/bin/env python
import sys, json
sys.path.append('/home/spam-filter/')
from threading import Thread
from tweets.tweetFilter import tweetFilter
# from news.newsFilter import newsFilter
from flask import Flask, request

app = Flask(__name__)
filter = tweetFilter()

@app.route('/predict', methods=['GET'])
def tweetPredict():
    tweet = request.args.get('tweet')
    result = filter.tweetFilterPredict(tweet)
    return json.dumps({'result': result, 'tweet': tweet}), 200, {'Content-Type': 'application/json'}

def callTweetFilter():
    # Train the filter first, then expose the prediction endpoint
    filter.tweetFilterTrain()
    app.run(port=9090)

# def callNewsFilter():
#     newsFilter()

if __name__ == '__main__':
    print("Console: ", "==== Spam Filter - Tweets & News ====")
    sys.stdout.flush()
    Thread(target=callTweetFilter).start()
    # Thread(target=callNewsFilter).start()

src/resources/tweet_spam_ham.csv Normal file

File diff suppressed because it is too large

261
src/tweets/tweetFilter.py Normal file

@@ -0,0 +1,261 @@
#!/usr/bin/env python
import sys
from math import log
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# word_tokenize and stopwords.words need these corpora at runtime
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# Global Metrics
HB_NB_Precision = 0
HB_NB_Recall = 0
HB_NB_F_Score = 0
HB_NB_Accuracy = 0
## Logic
def processTweet(tweet, gram=2):
    tweet = tweet.lower()         # Lower-case the text
    words = word_tokenize(tweet)  # Tokenise words in the text
    words = [w for w in words if len(w) > 2]
    if gram > 2:  ## Increasing the gram size can increase accuracy
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
        return w
    # Remove stopwords
    sw = stopwords.words('english')
    words = [word for word in words if word not in sw]
    stemmer = PorterStemmer()  # Stem words
    words = [stemmer.stem(word) for word in words]
    return words
class classifier(object):
    def __init__(self, trainData):
        self.tweet = trainData['tweet']
        self.labels = trainData['class']

    def train(self):
        self.TF_and_IDF()  ## Bag of words
        self.TF_IDF()      ## Term frequencies

    def TF_and_IDF(self):
        noTweets = self.tweet.shape[0]
        self.spam = self.labels.value_counts()[1]
        self.ham = self.labels.value_counts()[0]
        self.total = self.spam + self.ham
        # Initialise spam/ham counters
        self.spamCount = 0
        self.hamCount = 0
        self.tfSpam = dict()
        self.tfHam = dict()
        self.idfSpam = dict()
        self.idfHam = dict()
        ## Logic
        for entry in range(noTweets):
            processed = processTweet(self.tweet[entry])
            count = list()  # Tracks whether the word has occurred in the message or not (IDF count)
            for word in processed:
                if self.labels[entry]:
                    self.tfSpam[word] = self.tfSpam.get(word, 0) + 1
                    self.spamCount += 1
                else:
                    self.tfHam[word] = self.tfHam.get(word, 0) + 1
                    self.hamCount += 1
                if word not in count:  ## Additive smoothing is applied below
                    count += [word]
            for word in count:
                if self.labels[entry]:
                    self.idfSpam[word] = self.idfSpam.get(word, 0) + 1
                else:
                    self.idfHam[word] = self.idfHam.get(word, 0) + 1
    def TF_IDF(self):
        self.probSpam = dict()
        self.probHam = dict()
        self.sumSpam = 0
        self.sumHam = 0
        for word in self.tfSpam:
            self.probSpam[word] = (self.tfSpam[word]) * log((self.spam + self.ham) / (self.idfSpam[word] + self.idfHam.get(word, 0)))
            self.sumSpam += self.probSpam[word]
        for word in self.tfSpam:
            self.probSpam[word] = (self.probSpam[word] + 1) / (self.sumSpam + len(list(self.probSpam.keys())))
        for word in self.tfHam:
            self.probHam[word] = (self.tfHam[word]) * log((self.spam + self.ham) / (self.idfSpam.get(word, 0) + self.idfHam[word]))
            self.sumHam += self.probHam[word]
        for word in self.tfHam:
            self.probHam[word] = (self.probHam[word] + 1) / (self.sumHam + len(list(self.probHam.keys())))
        self.probSpamTotal, self.probHamTotal = self.spam / self.total, self.ham / self.total
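    # A note derived from the code above: each word is weighted by its term
    # frequency times an IDF factor, log(N / df(word)), where N is the total
    # message count and df the number of messages containing the word. The
    # class-conditional estimate is then additively (Laplace) smoothed:
    #
    #   P(word | spam) = (tfidf(word) + 1) / (sumSpam + |spam vocabulary|)
    #
    # classify() below compares the classes in log space; a word unseen in a
    # class contributes only the smoothing denominator for that class.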
    def classify(self, processed):
        pSpam, pHam = 0, 0
        for word in processed:
            if word in self.probSpam:
                pSpam += log(self.probSpam[word])
            else:
                pSpam -= log(self.sumSpam + len(list(self.probSpam.keys())))
            if word in self.probHam:
                pHam += log(self.probHam[word])
            else:
                pHam -= log(self.sumHam + len(list(self.probHam.keys())))
        pSpam += log(self.probSpamTotal)
        pHam += log(self.probHamTotal)
        return pSpam >= pHam

    def predict(self, testData):
        result = dict()
        for (i, tweet) in enumerate(testData):
            processed = processTweet(tweet)
            result[i] = int(self.classify(processed))
        return result
def metrics(labels, predictions):
    # Publish the results to the module-level metrics declared above
    global HB_NB_Precision, HB_NB_Recall, HB_NB_F_Score, HB_NB_Accuracy
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels[i] == 1 and predictions[i] == 1)
        true_neg += int(labels[i] == 0 and predictions[i] == 0)
        false_pos += int(labels[i] == 0 and predictions[i] == 1)
        false_neg += int(labels[i] == 1 and predictions[i] == 0)
    HB_NB_Precision = true_pos / (true_pos + false_pos)
    HB_NB_Recall = true_pos / (true_pos + false_neg)
    HB_NB_F_Score = 2 * HB_NB_Precision * HB_NB_Recall / (HB_NB_Precision + HB_NB_Recall)
    HB_NB_Accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
    print("HB Precision: ", HB_NB_Precision)
    print("HB Recall: ", HB_NB_Recall)
    print("HB F-score: ", HB_NB_F_Score)
    print("HB Accuracy: ", HB_NB_Accuracy)
class filterSpam(object):
    def __init__(self, training_set):
        self.training_set = training_set

    def trainFilter(self):
        self.dataset()
        self.train()

    def dataset(self):
        self.data = pd.read_csv(self.training_set)
        self.data['class'] = self.data['classes'].map({'ham': 0, 'spam': 1})
        self.data.drop(['classes'], axis=1, inplace=True)
        # Random 75/25 train/test split
        self.trainIndex, self.testIndex = list(), list()
        for i in range(self.data.shape[0]):
            if np.random.uniform(0, 1) < 0.75:
                self.trainIndex += [i]
            else:
                self.testIndex += [i]
        self.trainData = self.data.loc[self.trainIndex]
        self.testData = self.data.loc[self.testIndex]
        self.trainData.reset_index(inplace=True)
        self.testData.reset_index(inplace=True)
        self.trainData.drop(['index'], axis=1, inplace=True)
        self.testData.drop(['index'], axis=1, inplace=True)

    def train(self):
        self.spamFilter = classifier(self.trainData)
        self.spamFilter.train()

    def testData_Prediction(self):
        prediction = self.spamFilter.predict(self.testData['tweet'])
        return prediction

    def testPrediction(self):
        # Test spam/ham tweets - should return True and False respectively
        spam = processTweet("Earn more than 0015 btc free No deposit No investment Free Bitcoins - Earn $65 free btc in 5 minutes bitcoin freebtc getbtc")
        ham = processTweet("Bitcoin closed with some gains in month of February")
        hamTweet = self.spamFilter.classify(ham)
        spamTweet = self.spamFilter.classify(spam)
        print("Console: ", "Spam Tweet -- ", spamTweet)
        sys.stdout.flush()
        print("Console: ", "Ham Tweet -- ", hamTweet)
        sys.stdout.flush()

    def filterStatistics(self, prediction):
        metrics(self.testData['class'], prediction)

    def filterTweet(self, tweet):
        processed = processTweet(tweet)
        classified = self.spamFilter.classify(processed)
        return classified
class multinomialNaiveBayes(object):
    def __init__(self, training_set):
        self.training_set = training_set

    def trainFilter(self):
        self.dataset()
        self.train()
        self.predictTest()

    def dataset(self):
        self.data = pd.read_csv(self.training_set)
        self.data.drop_duplicates(inplace=True)
        self.data['class'] = self.data['classes'].map({'ham': 0, 'spam': 1})
        self.cv = CountVectorizer(analyzer=processTweet)
        messages_bow = self.cv.fit_transform(self.data['tweet'])
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(messages_bow, self.data['class'], test_size=0.20, random_state=0)

    def train(self):
        self.classifier = MultinomialNB()
        self.classifier.fit(self.X_train, self.y_train)

    def predictTest(self):
        self.pred = self.classifier.predict(self.X_test)
        print('Accuracy: ', accuracy_score(self.y_test, self.pred))

    def predict(self, tweet):
        message = self.cv.transform([tweet]).toarray()
        return self.classifier.predict(message)
class tweetFilter(object):
    def __init__(self):
        pass

    def tweetFilterTrain(self):
        self.Filter = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv")
        self.Filter.trainFilter()

        ### The self-coded NB gets around 75 -> 85% accuracy (not as good as sklearn's)
        # Filter = filterSpam("src/resources/tweet_spam_ham.csv")
        # Filter.trainFilter()
        #
        # prediction = Filter.testData_Prediction()
        # Filter.filterStatistics(prediction)
        #
        # Filter.testPrediction()

    def tweetFilterPredict(self, tweet):
        df = pd.DataFrame(self.Filter.predict(tweet))
        df[0] = df[0].map({0: 'ham', 1: 'spam'})
        return df[0][0]
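
The commented-out block in `tweetFilterTrain` exercises the self-coded filter instead of sklearn's. A minimal standalone sketch of doing the same, assuming it is run from the repository root so the relative CSV path resolves:

```python
import sys
sys.path.append('src')  # make the tweets package importable from the repo root
from tweets.tweetFilter import filterSpam

# Train and evaluate the hand-rolled TF-IDF Naive Bayes
# (roughly 75-85% accuracy, per the note above).
hbFilter = filterSpam("src/resources/tweet_spam_ham.csv")
hbFilter.trainFilter()                       # random 75/25 split, then fit
prediction = hbFilter.testData_Prediction()  # classify the held-out rows
hbFilter.filterStatistics(prediction)        # precision / recall / F-score / accuracy
hbFilter.testPrediction()                    # spot-check one spam and one ham tweet
print(hbFilter.filterTweet("Free btc, no deposit needed"))  # True => spam
```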