[21.02.20] Utility function extraction
This commit is contained in:
parent
11c29cc921
commit
1c2b58dcfc
@ -1,9 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os, re, sys
|
||||
import os, sys
|
||||
|
||||
from nltk import wordpunct_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
from datetime import datetime, timedelta
|
||||
import time
|
||||
|
||||
@ -16,6 +14,8 @@ from pathlib import Path # python3 only
|
||||
env_path = Path('.') / 'configuration/twitter.env'
|
||||
load_dotenv(dotenv_path=env_path)
|
||||
|
||||
from utils.tweetPreprocessing import *
|
||||
|
||||
class keys():
|
||||
|
||||
def __init__(self):
|
||||
|
||||
61
src/utils/tweetPreprocessing.py
Normal file
61
src/utils/tweetPreprocessing.py
Normal file
@ -0,0 +1,61 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import re, sys
|
||||
|
||||
from nltk import wordpunct_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
# Provides list of unicode emojis for extraction
|
||||
import emoji as ji
|
||||
|
||||
def cleanTweet(text):
    """Strip links, @-mentions and special characters from a tweet.

    Returns a 2-tuple:
        (cleaned text, space-joined string of emojis found in the ORIGINAL text)

    Note: emoji extraction scans the untouched input, so emojis removed by
    the regex are still reported.
    """
    # Characters kept: alphanumerics, space/tab and - % £ $; everything
    # else (plus mentions and http links) is deleted.
    noise = re.compile(r'([^0-9A-Za-z \-\%\£\$ \t])|(@[A-Za-z0-9]+)|(http\S+)')
    cleaned = noise.sub('', text)
    emojis = ' '.join(ch for ch in text if ch in ji.UNICODE_EMOJI)
    return cleaned, emojis
|
||||
|
||||
def removeSpacing(text):
    """Collapse every run of consecutive spaces into a single space."""
    # Only runs of 2+ need rewriting; single spaces are already fine,
    # which yields the same output as substituting every run of 1+.
    return re.sub(r" {2,}", " ", text)
|
||||
|
||||
def fixLines(text):
    """Replace each carriage return and newline with a single space.

    A Windows line ending "\r\n" therefore becomes two spaces, exactly as
    the character-class regex it replaces did.
    """
    return text.replace("\r", " ").replace("\n", " ")
|
||||
|
||||
def remove_non_ascii(text):
    """Drop every character outside the 7-bit ASCII range (ord >= 128)."""
    # encode with errors="ignore" silently discards non-ASCII characters,
    # equivalent to filtering on ord(c) < 128.
    return text.encode("ascii", "ignore").decode("ascii")
|
||||
|
||||
def detectLaguage(text):
    """Guess whether *text* is written in English.

    Scores every language in the nltk stopword corpus by counting how many
    of its stopwords occur in the text, then takes the best-scoring
    language. Prints the winner to stdout as a side effect.

    There are other ways to identify language - TextBlob.detect_language
    and n-grams.

    Returns:
        bool: True when the highest-scoring language is 'english'.

    NOTE(review): function name is misspelled ("Laguage") but kept for
    backward compatibility with existing callers.
    """
    tokens = wordpunct_tokenize(text)
    # Build the lower-cased word set ONCE; it is identical for every
    # language, so it must not be rebuilt inside the loop.
    words_set = {word.lower() for word in tokens}

    language_ratios = {}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        # Score = number of this language's stopwords present in the text.
        language_ratios[language] = len(words_set & stopwords_set)

    highest_ratio = max(language_ratios, key=language_ratios.get)

    print("Console: Text is - ", highest_ratio)
    sys.stdout.flush()

    return highest_ratio == 'english'
|
||||
|
||||
def checkLength(text):
    """Return True when the text contains more than five whitespace-separated tokens."""
    # The comparison already yields a bool, so no explicit branch is needed.
    return len(text.split()) > 5
|
||||
Loading…
x
Reference in New Issue
Block a user