[05.03.20] Spam filter for tweets: Kubernetes deployment, CI pipeline, Naive Bayes training, and endpoint exposure

This commit is contained in:
andrewso 2020-03-05 18:42:52 +00:00
parent a418a08aa1
commit 7b519630a2
12 changed files with 1533 additions and 0 deletions

9
Dockerfile Normal file

@@ -0,0 +1,9 @@
FROM python:3.7-alpine
MAINTAINER Andrew Sotheran <cryptosky.user@gmail.com>
RUN apk update && \
    apk add py-pip libc-dev gcc
RUN pip install utils pycryptodome && \
    pip install nltk pandas numpy sklearn flask
COPY . /home/spam-filter/.
EXPOSE 9090
CMD ["python", "/home/spam-filter/src/main.py"]

2
README.md Normal file

@@ -0,0 +1,2 @@
# Spam-Filter
Universal spam filter for news and tweet data.
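
The service trains a multinomial Naive Bayes model on startup and then serves predictions over HTTP on port 9090. A minimal sketch of querying the `/predict` endpoint (assuming the `requests` package is available on the client side):

```python
import requests

# /predict takes the tweet text as a query parameter and returns
# {"result": "spam" | "ham", "tweet": "<original text>"}
response = requests.get(
    "http://localhost:9090/predict",
    params={"tweet": "Earn $65 free btc in 5 minutes, no deposit needed"},
)
print(response.json())
```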

77
configuration/kubernetes/deployment.yaml Normal file

@@ -0,0 +1,77 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: LABEL
  name: RESOURCE_NAME
  namespace: production
spec:
  replicas: 1
  selector:
    matchLabels:
      app: RESOURCE_NAME
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: RESOURCE_NAME
    spec:
      containers:
      - image: REPOSITORY/IMAGE
        name: RESOURCE_NAME
        env:
        - name: KUBERNETES_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: CONTAINER_CORE_LIMIT
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
        - name: CONTAINER_MAX_MEMORY
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
        - name: DATABASE_URL
          valueFrom:
            secretKeyRef:
              name: endpoints
              key: dbGateway.url
        - name: DATABASE_PORT
          valueFrom:
            secretKeyRef:
              name: endpoints
              key: dbGateway.port
        - name: COINBASE_KEY
          valueFrom:
            secretKeyRef:
              name: coinbase-secret
              key: coinbase.api.key
        - name: COINBASE_SECRET
          valueFrom:
            secretKeyRef:
              name: coinbase-secret
              key: coinbase.api.secret
        - name: DB_GATEWAY_URL
          valueFrom:
            secretKeyRef:
              name: endpoints
              key: dbGateway.url
        ports:
        - containerPort: 9090
          name: RESOURCE_NAME
        imagePullPolicy: Always
        resources:
          requests:
            cpu: 50m
            memory: 32Mi
          limits:
            cpu: 100m
            memory: 32Mi
      restartPolicy: Always
      imagePullSecrets:
      - name: registry-secret

16
configuration/kubernetes/service.yaml Normal file

@@ -0,0 +1,16 @@
kind: Service
apiVersion: v1
metadata:
  labels:
    name: LABEL
  name: RESOURCE_NAME
  namespace: production
spec:
  selector:
    app: RESOURCE_NAME
  ports:
  - port: 9090
    protocol: TCP  # Service protocol must be TCP/UDP/SCTP; HTTP is not valid here
    targetPort: 9090
  sessionAffinity: None
  type: ClusterIP

100
Jenkinsfile Normal file

@@ -0,0 +1,100 @@
#!/usr/bin/env groovy

env.APPLICATION_NAME = 'spam-filter'
env.APPLICATION_LABEL = 'utilities'
env.GIT_BRANCH = 'master'
env.GIT_REPOSITORY_PATH = "github.com/andyjk15/${env.APPLICATION_NAME}.git"
env.GIT_REPOSITORY_URL = "https://${env.GIT_REPOSITORY_PATH}"
env.GITHUB_CREDENTIALS_ID = 'Github'
env.DOCKER_REPOSITORY = 'registry.cryptosky.me'
env.DOCKER_REPOSITORY_URL = "https://${env.DOCKER_REPOSITORY}"
env.DOCKER_REPOSITORY_TCP = "tcp://${env.DOCKER_REPOSITORY}:4243"
env.NAMESPACE = 'production'
env.SLAVE_LABEL = "cryptosky-aio-build"

def mvn( String goals ) {
    sh "mvn -s configuration/settings.xml --show-version --batch-mode ${goals}"
}

String get_application_version() {
    "1.0.0-b${env.BUILD_NUMBER}"
}

String executeShellScript( String shellPath, String arg1 = '', String arg2 = '', String arg3 = '', String arg4 = '' ) {
    sh "./${shellPath} ${arg1} ${arg2} ${arg3} ${arg4}"
}

try {
    timestamps {
        node ("${env.SLAVE_LABEL}") {
            stage('Initialise') {
                checkout([$class: 'GitSCM', branches: [[name: env.GIT_BRANCH]], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[credentialsId: env.GITHUB_CREDENTIALS_ID, url: env.GIT_REPOSITORY_URL]]])
                env.APPLICATION_VERSION = get_application_version()
                withCredentials(
                    [usernamePassword(
                        credentialsId: 'doctl',
                        passwordVariable: 'DOCTL_TOKEN',
                        usernameVariable: 'DOCTL_USERNAME'
                    )]
                ) {
                    sh "doctl auth init --access-token ${DOCTL_TOKEN}"
                    sh "doctl kubernetes cluster kubeconfig save cryptosky-kubernetes-cluster-production"
                }
            }
            stage('Test Artifact') {
                try {
                    // mvn 'verify -DskipUTs -DskipTests'
                } finally {
                    // mvn 'test'
                }
            }
            stage('Build Image') {
                // mvn 'clean package -DskipTests'
                executeShellScript("configuration/scripts/mapVarsToConfigs.sh",
                    env.DOCKER_REPOSITORY,
                    env.APPLICATION_NAME,
                    env.APPLICATION_VERSION,
                    env.APPLICATION_LABEL)
            }
            stage('Tag Repository') {
                withDockerServer([uri: "${env.DOCKER_REPOSITORY_TCP}"]) {
                    withDockerRegistry([credentialsId: 'Registry', url: "${env.DOCKER_REPOSITORY_URL}"]) {
                        def image = docker.build("${env.DOCKER_REPOSITORY}/${env.APPLICATION_NAME}:${env.APPLICATION_VERSION}")
                        image.push()
                        def latest = docker.build("${env.DOCKER_REPOSITORY}/${env.APPLICATION_NAME}:latest")
                        latest.push()
                    }
                }
                withCredentials(
                    [usernamePassword(
                        credentialsId: env.GITHUB_CREDENTIALS_ID,
                        passwordVariable: 'GIT_PASSWORD',
                        usernameVariable: 'GIT_USERNAME'
                    )]
                ) {
                    sh "git tag ${env.APPLICATION_VERSION}"
                    sh "git push https://${GIT_USERNAME}:${GIT_PASSWORD}@${env.GIT_REPOSITORY_PATH} ${env.APPLICATION_VERSION}"
                }
            }
            stage('Deploy') {
                executeShellScript("configuration/scripts/deployToKubernetes.sh",
                    env.APPLICATION_NAME)
            }
        }
    }
} catch ( exception ) {
    currentBuild.result = 'FAILURE'
    throw exception
} finally {
    // Do not let a failed build be reported as a success
    if (currentBuild.result != 'FAILURE') {
        currentBuild.result = 'SUCCESS'
    }
}

10
configuration/scripts/deployToKubernetes.sh Normal file

@@ -0,0 +1,10 @@
#!/usr/bin/env bash
APPLICATION_NAME=$1
kubectl apply -f configuration/kubernetes/deployment.yaml
kubectl apply -f configuration/kubernetes/service.yaml
kubectl get pods
kubectl rollout status deployment/${APPLICATION_NAME} --namespace=production

14
configuration/scripts/mapVarsToConfigs.sh Normal file

@@ -0,0 +1,14 @@
#!/usr/bin/env bash
DOCKER_REPOSITORY=$1
APPLICATION_NAME=$2
APPLICATION_VERSION=$3
APPLICATION_LABEL=$4
sed -i "s/REPOSITORY/${DOCKER_REPOSITORY}/g" configuration/kubernetes/deployment.yaml
sed -i "s/IMAGE/${APPLICATION_NAME}:${APPLICATION_VERSION}/g" configuration/kubernetes/deployment.yaml
sed -i "s/RESOURCE_NAME/${APPLICATION_NAME}/g" configuration/kubernetes/deployment.yaml
sed -i "s/LABEL/${APPLICATION_LABEL}/g" configuration/kubernetes/deployment.yaml
sed -i "s/RESOURCE_NAME/${APPLICATION_NAME}/g" configuration/kubernetes/service.yaml
sed -i "s/LABEL/${APPLICATION_LABEL}/g" configuration/kubernetes/service.yaml

0
src/__init__.py Normal file

37
src/main.py Normal file

@@ -0,0 +1,37 @@
#!/usr/bin/env python
import sys, json
sys.path.append('/home/spam-filter/')
from threading import Thread
from tweets.tweetFilter import tweetFilter
# from news.newsFilter import newsFilter
from flask import Flask, request

app = Flask(__name__)
filter = tweetFilter()

@app.route('/predict', methods=['GET'])
def tweetPredict():
    tweet = request.args.get('tweet')
    result = filter.tweetFilterPredict(tweet)
    return json.dumps({'result': result, 'tweet': tweet}), 200, {'Content-Type': 'application/json'}

def callTweetFilter():
    # Train the filter first, then expose the prediction endpoint
    filter.tweetFilterTrain()
    app.run(port=9090)

# def callNewsFilter():
#     newsFilter()

if __name__ == '__main__':
    print("Console: ", "==== Spam Filter - Tweets & News ====")
    sys.stdout.flush()
    Thread(target=callTweetFilter).start()
    # Thread(target=callNewsFilter).start()

src/resources/tweet_spam_ham.csv Normal file

File diff suppressed because it is too large

261
src/tweets/tweetFilter.py Normal file

@@ -0,0 +1,261 @@
#!/usr/bin/env python
import sys
from math import log
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# word_tokenize and stopwords.words need these corpora at runtime
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# Global Metrics
HB_NB_Precision = 0
HB_NB_Recall = 0
HB_NB_F_Score = 0
HB_NB_Accuracy = 0
## Logic
def processTweet(tweet, gram=2):
    tweet = tweet.lower()         # Lower-case the text
    words = word_tokenize(tweet)  # Tokenise words in the text
    words = [w for w in words if len(w) > 2]
    if gram > 2:  ## Increasing the gram size can increase accuracy
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
        return w
    # Remove stopwords
    sw = stopwords.words('english')
    words = [word for word in words if word not in sw]
    stemmer = PorterStemmer()  # Stem words
    words = [stemmer.stem(word) for word in words]
    return words
class classifier(object):
    def __init__(self, trainData):
        self.tweet = trainData['tweet']
        self.labels = trainData['class']

    def train(self):
        self.TF_and_IDF()  ## Bag of words
        self.TF_IDF()      ## Term frequencies

    def TF_and_IDF(self):
        noTweets = self.tweet.shape[0]
        self.spam = self.labels.value_counts()[1]
        self.ham = self.labels.value_counts()[0]
        self.total = self.spam + self.ham
        # Initialise spam/ham counters
        self.spamCount = 0
        self.hamCount = 0
        self.tfSpam = dict()
        self.tfHam = dict()
        self.idfSpam = dict()
        self.idfHam = dict()
        ## Logic
        for entry in range(noTweets):
            processed = processTweet(self.tweet[entry])
            count = list()  # Tracks whether the word has occurred in the message or not (IDF count)
            for word in processed:
                if self.labels[entry]:
                    self.tfSpam[word] = self.tfSpam.get(word, 0) + 1
                    self.spamCount += 1
                else:
                    self.tfHam[word] = self.tfHam.get(word, 0) + 1
                    self.hamCount += 1
                if word not in count:  ## Additive smoothing is applied below
                    count += [word]
            for word in count:
                if self.labels[entry]:
                    self.idfSpam[word] = self.idfSpam.get(word, 0) + 1
                else:
                    self.idfHam[word] = self.idfHam.get(word, 0) + 1
    def TF_IDF(self):
        self.probSpam = dict()
        self.probHam = dict()
        self.sumSpam = 0
        self.sumHam = 0
        for word in self.tfSpam:
            self.probSpam[word] = (self.tfSpam[word]) * log((self.spam + self.ham) / (self.idfSpam[word] + self.idfHam.get(word, 0)))
            self.sumSpam += self.probSpam[word]
        for word in self.tfSpam:
            self.probSpam[word] = (self.probSpam[word] + 1) / (self.sumSpam + len(list(self.probSpam.keys())))
        for word in self.tfHam:
            self.probHam[word] = (self.tfHam[word]) * log((self.spam + self.ham) / (self.idfSpam.get(word, 0) + self.idfHam[word]))
            self.sumHam += self.probHam[word]
        for word in self.tfHam:
            self.probHam[word] = (self.probHam[word] + 1) / (self.sumHam + len(list(self.probHam.keys())))
        self.probSpamTotal, self.probHamTotal = self.spam / self.total, self.ham / self.total
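    # A note derived from the code above: each word is weighted by its term
    # frequency times an IDF factor, log(N / df(word)), where N is the total
    # message count and df the number of messages containing the word. The
    # class-conditional estimate is then additively (Laplace) smoothed:
    #
    #   P(word | spam) = (tfidf(word) + 1) / (sumSpam + |spam vocabulary|)
    #
    # classify() below compares the classes in log space; a word unseen in a
    # class contributes only the smoothing denominator for that class.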
    def classify(self, processed):
        pSpam, pHam = 0, 0
        for word in processed:
            if word in self.probSpam:
                pSpam += log(self.probSpam[word])
            else:
                pSpam -= log(self.sumSpam + len(list(self.probSpam.keys())))
            if word in self.probHam:
                pHam += log(self.probHam[word])
            else:
                pHam -= log(self.sumHam + len(list(self.probHam.keys())))
        pSpam += log(self.probSpamTotal)
        pHam += log(self.probHamTotal)
        return pSpam >= pHam

    def predict(self, testData):
        result = dict()
        for (i, tweet) in enumerate(testData):
            processed = processTweet(tweet)
            result[i] = int(self.classify(processed))
        return result
def metrics(labels, predictions):
    # Publish the results to the module-level metrics declared above
    global HB_NB_Precision, HB_NB_Recall, HB_NB_F_Score, HB_NB_Accuracy
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels[i] == 1 and predictions[i] == 1)
        true_neg += int(labels[i] == 0 and predictions[i] == 0)
        false_pos += int(labels[i] == 0 and predictions[i] == 1)
        false_neg += int(labels[i] == 1 and predictions[i] == 0)
    HB_NB_Precision = true_pos / (true_pos + false_pos)
    HB_NB_Recall = true_pos / (true_pos + false_neg)
    HB_NB_F_Score = 2 * HB_NB_Precision * HB_NB_Recall / (HB_NB_Precision + HB_NB_Recall)
    HB_NB_Accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
    print("HB Precision: ", HB_NB_Precision)
    print("HB Recall: ", HB_NB_Recall)
    print("HB F-score: ", HB_NB_F_Score)
    print("HB Accuracy: ", HB_NB_Accuracy)
class filterSpam(object):
    def __init__(self, training_set):
        self.training_set = training_set

    def trainFilter(self):
        self.dataset()
        self.train()

    def dataset(self):
        self.data = pd.read_csv(self.training_set)
        self.data['class'] = self.data['classes'].map({'ham': 0, 'spam': 1})
        self.data.drop(['classes'], axis=1, inplace=True)
        # Random 75/25 train/test split
        self.trainIndex, self.testIndex = list(), list()
        for i in range(self.data.shape[0]):
            if np.random.uniform(0, 1) < 0.75:
                self.trainIndex += [i]
            else:
                self.testIndex += [i]
        self.trainData = self.data.loc[self.trainIndex]
        self.testData = self.data.loc[self.testIndex]
        self.trainData.reset_index(inplace=True)
        self.testData.reset_index(inplace=True)
        self.trainData.drop(['index'], axis=1, inplace=True)
        self.testData.drop(['index'], axis=1, inplace=True)

    def train(self):
        self.spamFilter = classifier(self.trainData)
        self.spamFilter.train()

    def testData_Prediction(self):
        prediction = self.spamFilter.predict(self.testData['tweet'])
        return prediction

    def testPrediction(self):
        # Test spam/ham tweets - should return True and False respectively
        spam = processTweet("Earn more than 0015 btc free No deposit No investment Free Bitcoins - Earn $65 free btc in 5 minutes bitcoin freebtc getbtc")
        ham = processTweet("Bitcoin closed with some gains in month of February")
        hamTweet = self.spamFilter.classify(ham)
        spamTweet = self.spamFilter.classify(spam)
        print("Console: ", "Spam Tweet -- ", spamTweet)
        sys.stdout.flush()
        print("Console: ", "Ham Tweet -- ", hamTweet)
        sys.stdout.flush()

    def filterStatistics(self, prediction):
        metrics(self.testData['class'], prediction)

    def filterTweet(self, tweet):
        processed = processTweet(tweet)
        classified = self.spamFilter.classify(processed)
        return classified
class multinomialNaiveBayes(object):
    def __init__(self, training_set):
        self.training_set = training_set

    def trainFilter(self):
        self.dataset()
        self.train()
        self.predictTest()

    def dataset(self):
        self.data = pd.read_csv(self.training_set)
        self.data.drop_duplicates(inplace=True)
        self.data['class'] = self.data['classes'].map({'ham': 0, 'spam': 1})
        self.cv = CountVectorizer(analyzer=processTweet)
        messages_bow = self.cv.fit_transform(self.data['tweet'])
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(messages_bow, self.data['class'], test_size=0.20, random_state=0)

    def train(self):
        self.classifier = MultinomialNB()
        self.classifier.fit(self.X_train, self.y_train)

    def predictTest(self):
        self.pred = self.classifier.predict(self.X_test)
        print('Accuracy: ', accuracy_score(self.y_test, self.pred))

    def predict(self, tweet):
        message = self.cv.transform([tweet]).toarray()
        return self.classifier.predict(message)
class tweetFilter(object):
    def __init__(self):
        pass

    def tweetFilterTrain(self):
        self.Filter = multinomialNaiveBayes("src/resources/tweet_spam_ham.csv")
        self.Filter.trainFilter()

        ### The self-coded NB gets around 75 -> 85% accuracy (not as good as sklearn's)
        # Filter = filterSpam("src/resources/tweet_spam_ham.csv")
        # Filter.trainFilter()
        #
        # prediction = Filter.testData_Prediction()
        # Filter.filterStatistics(prediction)
        #
        # Filter.testPrediction()

    def tweetFilterPredict(self, tweet):
        df = pd.DataFrame(self.Filter.predict(tweet))
        df[0] = df[0].map({0: 'ham', 1: 'spam'})
        return df[0][0]
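
The commented-out block in `tweetFilterTrain` exercises the self-coded filter instead of sklearn's. A minimal standalone sketch of doing the same, assuming it is run from the repository root so the relative CSV path resolves:

```python
import sys
sys.path.append('src')  # make the tweets package importable from the repo root
from tweets.tweetFilter import filterSpam

# Train and evaluate the hand-rolled TF-IDF Naive Bayes
# (roughly 75-85% accuracy, per the note above).
hbFilter = filterSpam("src/resources/tweet_spam_ham.csv")
hbFilter.trainFilter()                       # random 75/25 split, then fit
prediction = hbFilter.testData_Prediction()  # classify the held-out rows
hbFilter.filterStatistics(prediction)        # precision / recall / F-score / accuracy
hbFilter.testPrediction()                    # spot-check one spam and one ham tweet
print(hbFilter.filterTweet("Free btc, no deposit needed"))  # True => spam
```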