From 1c2b58dcfc0ed899d40c911c6c7fbd3bd7b83b3f Mon Sep 17 00:00:00 2001
From: andrewso <9V5f1FkzI2LD>
Date: Fri, 21 Feb 2020 16:45:26 +0000
Subject: [PATCH] [21.02.20] Utility function extraction

---
 src/tweets/collector.py         |  6 ++--
 src/utils/tweetPreprocessing.py | 61 +++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 3 deletions(-)
 create mode 100644 src/utils/tweetPreprocessing.py

diff --git a/src/tweets/collector.py b/src/tweets/collector.py
index 855f202..99e6cd1 100644
--- a/src/tweets/collector.py
+++ b/src/tweets/collector.py
@@ -1,9 +1,7 @@
 #!/usr/bin/env python
 
-import os, re, sys
+import os, sys
 
-from nltk import wordpunct_tokenize
-from nltk.corpus import stopwords
 from datetime import datetime, timedelta
 import time
 
@@ -16,6 +14,8 @@ from pathlib import Path # python3 only
 env_path = Path('.') / 'configuration/twitter.env'
 load_dotenv(dotenv_path=env_path)
 
+from utils.tweetPreprocessing import *
+
 class keys():
 
     def __init__(self):
diff --git a/src/utils/tweetPreprocessing.py b/src/utils/tweetPreprocessing.py
new file mode 100644
index 0000000..3ad53a4
--- /dev/null
+++ b/src/utils/tweetPreprocessing.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+import re, sys
+
+from nltk import wordpunct_tokenize
+from nltk.corpus import stopwords
+
+# Provides list of unicode emojis for extraction
+import emoji as ji
+
+def cleanTweet(text):
+    # Function to clean tweets, removes links and special characters
+    return re.sub(r'([^0-9A-Za-z \-\%\£\$ \t])|(@[A-Za-z0-9]+)|(http\S+)', '', text), ' '.join(c for c in text if c in ji.UNICODE_EMOJI)
+
+def removeSpacing(text):
+    return re.sub(r'( +)', ' ', text)
+
+def fixLines(text):
+    return re.sub(r"([\r\n])", " ", text)
+    #return re.sub(r"(\w)([A-Z])", r"\1 \2", text)
+
+def remove_non_ascii(text):
+    return ''.join(i for i in text if ord(i)<128)
+
+def detectLaguage(text):
+    """
+    Calculate the probability of given text is written in several languages
+    Using nltk stopwords and comparing to all supported languages
+    There are other ways to identify this - TextBlob.detect_language and Ngrams
+    """
+
+    language_ratios = {}
+    tokens = wordpunct_tokenize(text)
+    words = [word.lower() for word in tokens]
+
+    # Compute per language in nltk number of stopwords in text
+    for language in stopwords.fileids():
+        stopwords_set = set(stopwords.words(language))
+        words_set = set(words)
+        common_elements = words_set.intersection(stopwords_set)
+
+        language_ratios[language] = len(common_elements) # Ratio scores
+
+    ratios = language_ratios
+
+    highest_ratio = max(ratios, key=ratios.get)
+
+    print("Console: Text is - ", highest_ratio)
+    sys.stdout.flush()
+
+    if highest_ratio == 'english':
+        return True
+    else:
+        return False
+
+def checkLength(text):
+    tokens = text.split()
+    if len(tokens) <= 5:
+        return False
+    else:
+        return True
\ No newline at end of file