[15.10.20] Testing

2020-10-15 20:19:39 +01:00 · 2020-10-15 20:15:55 +01:00 · 2020-10-15 18:10:57 +01:00 · 2020-10-15 17:19:44 +01:00 · 2020-10-15 14:01:06 +01:00 · 2020-10-15 11:31:50 +01:00
7 changed files with 238 additions and 62 deletions
--- a/configuration/kubernetes/deployment.yaml
+++ b/configuration/kubernetes/deployment.yaml
@ -119,10 +119,10 @@ spec:
        imagePullPolicy: Always
        resources:
          requests:
-            cpu: 64m
+            cpu: 100m
            memory: 64Mi
          limits:
-            cpu: 150m
+            cpu: 500m
            memory: 256Mi
        securityContext:
          capabilities:
--- a/src/tweets/collector.py
+++ b/src/tweets/collector.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python

-import os, sys, json
+import os, sys, json, uuid

 from datetime import datetime, timedelta
 from time import sleep, time
@ -17,12 +17,14 @@ from src.utils.spamFilter import callSpamFilter
 from src.utils.sentimentAnalyser import callSentimentAnalyser
 from src.utils.activemqConnect import activeMQSender
 from src.utils.jsonLogger import log
+from src.utils.whitelistedWords import filterOutTweetsWithNoneWhitelistedWords

 from http.client import IncompleteRead
 from urllib3.exceptions import ProtocolError

 hourStack = []
 dumpStack = []
+processStack = []

 class keys():

@ -32,20 +34,20 @@ class keys():
        self.access_token = os.getenv("ACCESS_TOKEN")
        self.access_secret = os.getenv("ACCESS_SECRET")

-def sendToArtemis(pos, neu, neg, compound, type):
+def sendToArtemis(syncId, pos, neu, neg, compound, type):

    timestamp = datetime.now() + timedelta(hours=1)
    strippedTimestamp = timestamp.replace(minute=0, second=0, microsecond=0)
    timestamp = strippedTimestamp.strftime('%Y-%m-%dT%H:%M:%S')

-    message = { "timestamp" : timestamp, "pos" : pos, "neu" : neu, "neg" : neg, "compound" : compound, "type": type }
+    message = { "timestamp" : timestamp, "syncId": str(syncId), "pos" : pos, "neu" : neu, "neg" : neg, "compound" : compound, "type": type }

    messageJson = json.dumps(message, indent = 4)

-    log("Sending message to TweetSave queue", 'INFO')
+    log("Sending message to TweetSave queue for SyncId [{}]".format(syncId), 'INFO')
    log("Message: {}".format(message), 'INFO')

-    activeMQSender(messageJson)
+    activeMQSender(messageJson, syncId)

 class Streamer():

@ -54,6 +56,7 @@ class Streamer():

    def stream_tweets(self, hashtag):
        listener = Listener(hashtag)
+
        auth = OAuthHandler(keys().api_key, keys().api_secret)

        log("Authorising with twitter API...", 'INFO')
@ -66,17 +69,13 @@ class Streamer():

        while True:
            try:
-                stream = Stream(auth=api.auth, listener=listener, tweet_mode='extended')
+                stream = Stream(auth, listener=listener, tweet_mode='extended')
                stream.filter(languages=["en"], track=hashtag)
            except IncompleteRead:
                log("Incomplete Read Error", 'ERR')
-                stream = Stream(auth=api.auth, listener=listener, tweet_mode='extended')
-                stream.filter(languages=["en"], track=hashtag)
                continue
            except ProtocolError:
                log("Protocol Error", 'ERR')
-                stream = Stream(auth=api.auth, listener=listener, tweet_mode='extended')
-                stream.filter(languages=["en"], track=hashtag)
                continue

 class Listener(StreamListener):
@ -91,61 +90,68 @@ class Listener(StreamListener):
        if (time() - self.start_time) < self.limit:
            data = json.loads(data)

+            log(len(dumpStack), 'INFO')
+
            # Check if tweet is a retweet
            if 'retweeted_status' in data:
                if 'extended_tweet' in data['retweeted_status']:
                    #if tweet is over the 140 word limit
                    text = data['retweeted_status']['extended_tweet']['full_text']
-                    processTweet(text, self.hashtag)
-                    # dumpStack.append({'type': self.hashtag, 'tweet': text})
+                    text = filterOutTweetsWithNoneWhitelistedWords(text)
+                    dumpStack.append({'type': self.hashtag, 'tweet': text})
                else:
                    text = data['retweeted_status']['text']
-                    processTweet(text, self.hashtag)
-                    # dumpStack.append({'type': self.hashtag, 'tweet': text})
+                    text = filterOutTweetsWithNoneWhitelistedWords(text)
+                    dumpStack.append({'type': self.hashtag, 'tweet': text})
            else:
                # Else if a normal Tweeet
                if 'extended_tweet' in data:
                    # If tweet is over 140 word limit
                    text = data['extended_tweet']['full_text']
-                    processTweet(text, self.hashtag)
-                    # dumpStack.append({'type': self.hashtag, 'tweet': text})
+                    text = filterOutTweetsWithNoneWhitelistedWords(text)
+                    dumpStack.append({'type': self.hashtag, 'tweet': text})

-def processTweet(text, type):
+def processTweet(syncId):

-    log("hourStack size :: [{}]".format(len(hourStack)), "INFO")
-    # processStack = dumpStack.copy()
-    # dumpStack.clear()
+    log(len(dumpStack), 'INFO')

-    # log("Processing [{}] Tweets...".format(len(processStack)), 'INFO')
+    processStack = dumpStack.copy()
+    dumpStack.clear()

-    # if len(processStack) != 0:
-    #     for tweet in processStack:
-    removedLines = fixLines(text)
-    removedSpecialChars = cleanTweet(removedLines)
-    removedSpacing = removeSpacing(removedSpecialChars[0])
-    tweetLength = checkLength(removedSpacing)
-    if tweetLength == True:
+    # log("Processing [{}] Tweet...".format(text), 'INFO')

-        checkIfEnglish = detectLaguage(removedSpecialChars[0])
+    if len(processStack) != 0:
+        for tweet in processStack:

-        if checkIfEnglish == True:
+            removedLines = fixLines(tweet["tweet"])

-            tweetText = remove_non_ascii(removedSpacing)
+            removedSpecialChars = cleanTweet(removedLines)
+            removedSpacing = removeSpacing(removedSpecialChars[0])

-            # log("Cleaned Tweet: {}".format(tweetText), 'INFO')
+            tweetLength = checkLength(removedSpacing)
+            if tweetLength == True:

-            cleanedTweet = tweetText + ' ' + removedSpecialChars[1]
+                checkIfEnglish = detectLaguage(removedSpecialChars[0])

-            if callSpamFilter(cleanedTweet) != 'spam':
-                pos, neu, neg, compound = callSentimentAnalyser(cleanedTweet)
+                if checkIfEnglish == True:

-                if compound != 0.0 | neu <= 0.8:
-                    hourTweet = {'pos': pos, 'neu': neu, 'neg': neg, 'compound': compound, 'type': type}
+                    tweetText = remove_non_ascii(removedSpacing)

-                    hourStack.append(hourTweet)
-    #     processStack.clear()
-    # else:
-    #     log("Dump Stack was Empty", 'WARN')
+                    # log("Cleaned Tweet: {}".format(tweetText), 'INFO')
+
+                    cleanedTweet = tweetText + ' ' + removedSpecialChars[1]
+
+                    if callSpamFilter(cleanedTweet, syncId) != 'spam':
+
+                        pos, neu, neg, compound = callSentimentAnalyser(cleanedTweet, syncId)
+
+                        if compound != 0.0 and neu <= 0.6:
+                            hourTweet = {'pos': pos, 'neu': neu, 'neg': neg, 'compound': compound, 'type': tweet["type"]}
+
+                            hourStack.append(hourTweet)
+        processStack.clear()
+    else:
+        log("Dump Stack was Empty", 'WARN')

 def collector(hashtag):

@ -171,17 +177,23 @@ def createHourJob():
    log("Creating hour job...", 'INFO')
    schedule.clear("sendToArtemis")
    ovPos, ovNeu, ovNeg, ovCompound = 0, 0, 0, 0
+    type = ""

    global timeF
    timeF = timeFunction()

+    syncId = uuid.uuid4()
+
+    processTweet(syncId)
+
    processStack = hourStack.copy()
+    hourStack.clear()

    log("Extracting sentiment scores...", 'INFO')

    if len(processStack) != 0:
        log("Process stack size is :: [{}]".format(len(processStack)), 'INFO')
-        for item in hourStack:
+        for item in processStack:
            ovPos = ovPos + item['pos']
            ovNeu = ovNeu + item['neu']
            ovNeg = ovNeg + item['neg']
@ -196,9 +208,9 @@ def createHourJob():
        if type == "bitcoin":
            type = 'btc_usd'

-        hourStack.clear()
+        processStack.clear()

-        sendToArtemis(pos, neu, neg, compound, type)
+        sendToArtemis(syncId, pos, neu, neg, compound, type)
    else:
        log("Stack is empty", 'WARN')

@ -212,6 +224,7 @@ def collectorMain(hashtag):
    for i in range(len(hashtag)):
        Thread(target=collector, args=[hashtag[i]]).start()

+    sleep(2)
    createHourJob()

    while True:
--- a/src/utils/activemqConnect.py
+++ b/src/utils/activemqConnect.py
@ -17,14 +17,20 @@ class keys():
    def returnKeys(self):
        return self.addr, self.port, self.amqU, self.amqP

-def activeMQSender(message):
+def activeMQSender(message, syncId):
    addr, port, mqUser, mqPass = keys().returnKeys()

    log("Attempting Connection to Artemis...", 'INFO')
    con = stomp.Connection([(addr, port)], auto_content_length=False)
    con.connect( mqUser, mqPass, wait=True)

-    con.send("TweetSave", message, content_type="application/json", headers={"Content-Type":"application/json"})
+    con.send("TweetSave",
+             message,
+             content_type="application/json",
+             headers={
+                 "Content-Type":"application/json",
+                 "X-CRYPTO-Sync-ID":syncId
+             })

    con.disconnect()

--- a/src/utils/jsonLogger.py
+++ b/src/utils/jsonLogger.py
@ -28,13 +28,13 @@ def setup_logging(log_level='INFO'):
    logHandler.setFormatter(formatter)
    logger.addHandler(logHandler)

-def log(message, level):
+def log(message, level, syncId=""):
    logger = logging.getLogger(__name__)
    if level == 'INFO':
-        logger.info(message)
+        logger.info(message, extra={"X-CRYPTO-Sync-ID" : syncId})
    elif level == 'WARN':
-        logger.warn(message)
+        logger.warn(message, extra={"X-CRYPTO-Sync-ID" : syncId})
    elif level == 'ERR':
-        logger.error(message)
+        logger.error(message, extra={"X-CRYPTO-Sync-ID" : syncId})
    elif level == 'DEBUG':
-        logger.debug(message)
+        logger.debug(message, extra={"X-CRYPTO-Sync-ID" : syncId})
--- a/src/utils/sentimentAnalyser.py
+++ b/src/utils/sentimentAnalyser.py
@ -9,11 +9,14 @@ class keys():
    def __init__(self):
        self.sentiment_analyser_uri = os.getenv("SENTIMENT_URL")

-def callSentimentAnalyser(tweet):
-    # log("Calling Sentiment Analyser for [{}]".format(tweet), 'INFO')
+def callSentimentAnalyser(tweet, syncId):
+    headers = {
+        "content-type":"text",
+        "X-CRYPTO-Sync-ID" : str(syncId)
+    }
    try:
        uri = keys().sentiment_analyser_uri + "/sentiment?tweet="+tweet
-        response = requests.request("GET", uri)
+        response = requests.request("GET", url=uri, headers=headers)

        response = json.loads(response.text)

@ -21,5 +24,5 @@ def callSentimentAnalyser(tweet):

        return scores["pos"], scores["neu"], scores["neg"], scores["compound"]
    except:
-        log("Could not call Sentiment Analyser Service", 'ERR')
+        log("Could not call Sentiment Analyser Service with syncId of [{}]".format(syncId), 'ERR', syncId)
        return 0, 0, 0, 0
--- a/src/utils/spamFilter.py
+++ b/src/utils/spamFilter.py
@ -9,16 +9,18 @@ class keys():
    def __init__(self):
        self.spamFilter_uri = os.getenv("FILTER_URL")

-def callSpamFilter(tweet):
+def callSpamFilter(tweet, syncId):
+    headers = {
+        "content-type":"text",
+        "X-CRYPTO-Sync-ID" : str(syncId)
+    }
    try:
        uri = keys().spamFilter_uri + "/predict?tweet="+tweet
-        response = requests.request("GET", uri)
+        response = requests.request("GET", url=uri, headers=headers)

        response = json.loads(response.text)

-        # log("Spam Filter result for [{}] is [{}]".format(tweet, response["result"]), 'INFO')
-
        return response["result"]
    except:
-        log("Could not call spam filter service", 'ERR')
+        log("Could not call spam filter service with syncId of [{}]".format(syncId), 'ERR', syncId)
        return ""
--- a/src/utils/whitelistedWords.py
+++ b/src/utils/whitelistedWords.py
@ -0,0 +1,152 @@
+#!/usr/bin/env python
+
+whitelist = [
+    "bull",
+    "bear",
+    "bullish",
+    "bearish",
+    "up",
+    "down",
+    "high",
+    "low",
+    "higher",
+    "lower",
+    "absconded",
+    "maximalists",
+    "regulate",
+    "infamous",
+    "tradehigher",
+    "tradelower",
+    "revival",
+    "centralized",
+    "decentralized",
+    "centralised",
+    "decentralised",
+    "decentralization",
+    "decentralisation",
+    "centralization",
+    "centralisation",
+    "bans",
+    "hodl",
+    "ambiguity",
+    "revolutionize",
+    "revolutionise",
+    "consolidation",
+    "shorts",
+    "longs",
+    "long",
+    "short",
+    "shorting",
+    "grow",
+    "volatile",
+    "rally",
+    "rallying",
+    "noob",
+    "noobs",
+    "innovation",
+    "bottom",
+    "top",
+    "topped",
+    "bottomed",
+    "upwards",
+    "downwards",
+    "invest",
+    "raging",
+    "rocketing",
+    "swing",
+    "swinging",
+    "stake",
+    "whale",
+    "whales",
+    "lull",
+    "moon",
+    "choppy",
+    "buy",
+    "buying",
+    "sell",
+    "selling",
+    "startselling",
+    "stopselling",
+    "startbuying",
+    "stopbuying",
+    "bitcoin",
+    "btc",
+    "eth",
+    "xmr",
+    "xrp",
+    "ripple",
+    "block",
+    "reward",
+    "airdrop",
+    "drop",
+    "raise",
+    "stack",
+    "stake",
+    "invest",
+    "pull",
+    "push",
+    "token",
+    "sale",
+    "unhappy",
+    "happy",
+    "expert",
+    "novice"
+    "passed",
+    "mark",
+    "decline",
+    "incline",
+    "fees",
+    "crypto",
+    "wallet",
+    "price",
+    "history",
+    "reached",
+    "upward",
+    "downward",
+    "trading",
+    "mining",
+    "defi",
+    "finance",
+    "blockchain",
+    "interest",
+    "alt",
+    "alts",
+    "fiat",
+    "fiat",
+    "currency",
+    "currencies",
+    "wealth",
+    "hype",
+    "hyped",
+    "achievement",
+    "platform",
+    "incremental",
+    "increment",
+    "decrement",
+    "decremental",
+    "success",
+    "loss",
+    "win",
+    "lose",
+    "worth",
+    "strongest",
+    "weakest",
+    "strong",
+    "weak",
+    "trade",
+    "popping",
+    "sucking",
+    "shard",
+    "sharding",
+    "industry",
+    "powerful",
+    "better",
+    "worse"
+]
+
+def filterOutTweetsWithNoneWhitelistedWords(text):
+    if any(x in text for x in whitelist):
+        return text
+    else:
+        # log("Tweet [{}] did not contain any keywords for it to be considered crypto related".format(text), 'WARN')
+        return ""
Author	SHA1	Message	Date
andrewso	b3f8ba4cec	[15.10.20] Testing	2020-10-15 20:19:39 +01:00
andrewso	da43817926	[15.10.20] Testing	2020-10-15 20:15:55 +01:00
andrewso	17bd1399cc	[15.10.20] Testing	2020-10-15 18:10:57 +01:00
andrewso	7325a509bd	[15.10.20] Testing	2020-10-15 17:19:44 +01:00
andrewso	affd25379a	[15.10.20] Testing	2020-10-15 14:01:06 +01:00
andrewso	0106ec9044	[15.10.20] Testing	2020-10-15 11:31:50 +01:00
andrewso	5c832a70fe	[15.10.20] Testing	2020-10-15 11:20:50 +01:00
andrewso	3c25310d76	[14.10.20] Testing	2020-10-14 18:51:36 +01:00
andrewso	6626c0864b	[14.10.20] Testing	2020-10-14 18:11:10 +01:00
andrewso	1d459420de	[14.10.20] Testing	2020-10-14 13:17:16 +01:00
andrewso	fba738d690	[14.10.20] Testing	2020-10-14 12:04:08 +01:00
andrewso	73191de94f	[14.10.20] Testing - Limits	2020-10-14 12:00:30 +01:00
andrewso	8e15f8c9f8	[14.10.20] Testing	2020-10-14 09:48:40 +01:00
andrewso	ab669a1ed7	[13.10.20] Testing	2020-10-13 09:36:20 +01:00
andrewso	9baf56f267	[13.10.20] Testing	2020-10-13 09:31:39 +01:00
andrewso	50e2ef40fd	[12.10.20] Testing	2020-10-12 18:05:18 +01:00
andrewso	fe90e08191	[12.10.20] Testing	2020-10-12 17:58:59 +01:00
andrewso	f11b94df52	[12.10.20] Testing	2020-10-12 17:48:57 +01:00
andrewso	ab1288351c	[12.10.20] Testing	2020-10-12 17:41:15 +01:00
andrewso	e2df32e5d6	[12.10.20] Testing	2020-10-12 17:30:37 +01:00
andrewso	dea841345c	[12.10.20] Testing	2020-10-12 17:27:48 +01:00
andrewso	73c4a171ab	[12.10.20] Testing	2020-10-12 17:26:06 +01:00