[21.02.20] Utility function extraction
This commit is contained in:
parent
11c29cc921
commit
1c2b58dcfc
@ -1,9 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import os, re, sys
|
import os, sys
|
||||||
|
|
||||||
from nltk import wordpunct_tokenize
|
|
||||||
from nltk.corpus import stopwords
|
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -16,6 +14,8 @@ from pathlib import Path # python3 only
|
|||||||
env_path = Path('.') / 'configuration/twitter.env'
|
env_path = Path('.') / 'configuration/twitter.env'
|
||||||
load_dotenv(dotenv_path=env_path)
|
load_dotenv(dotenv_path=env_path)
|
||||||
|
|
||||||
|
from utils.tweetPreprocessing import *
|
||||||
|
|
||||||
class keys():
|
class keys():
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|||||||
61
src/utils/tweetPreprocessing.py
Normal file
61
src/utils/tweetPreprocessing.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import re, sys
|
||||||
|
|
||||||
|
from nltk import wordpunct_tokenize
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
|
||||||
|
# Provides list of unicode emojis for extraction
|
||||||
|
import emoji as ji
|
||||||
|
|
||||||
|
def cleanTweet(text):
    """Clean a tweet and extract its emojis.

    Returns a tuple:
      - the tweet text with @mentions, URLs, and special characters removed
        (digits, letters, space, tab, and - % £ $ are kept)
      - a space-joined string of the emoji characters found in the text

    NOTE(review): relies on the module-level `import emoji as ji`; newer
    releases of the emoji package renamed UNICODE_EMOJI — verify the
    installed version still exposes it.
    """
    # The mention and URL alternatives must come BEFORE the single-character
    # class: previously the class was tried first and consumed the leading
    # '@' on its own, so the mention branch never matched and usernames
    # were left behind in the cleaned text.
    cleaned = re.sub(
        r'(@[A-Za-z0-9]+)|(http\S+)|([^0-9A-Za-z \-\%\£\$\t])',
        '',
        text,
    )
    return cleaned, ' '.join(c for c in text if c in ji.UNICODE_EMOJI)
|
||||||
|
|
||||||
|
def removeSpacing(text):
    """Collapse every run of consecutive spaces into a single space."""
    repeated_spaces = re.compile(r' {2,}')
    return repeated_spaces.sub(' ', text)
|
||||||
|
|
||||||
|
def fixLines(text):
    """Replace every carriage return and newline with a single space."""
    return text.replace('\r', ' ').replace('\n', ' ')
|
||||||
|
|
||||||
|
def remove_non_ascii(text):
    """Strip every character outside the 7-bit ASCII range."""
    ascii_chars = [ch for ch in text if ord(ch) < 128]
    return ''.join(ascii_chars)
|
||||||
|
|
||||||
|
def detectLaguage(text):
    """Return True when the text is most likely written in English.

    Scores every language in nltk's stopword corpus by counting how many
    of its stopwords occur in the text, then picks the language with the
    highest overlap. There are other ways to identify language —
    TextBlob.detect_language and n-gram models, for instance.
    """
    words = {token.lower() for token in wordpunct_tokenize(text)}

    # Per nltk language: how many of its stopwords appear in the text.
    language_ratios = {}
    for language in stopwords.fileids():
        overlap = words & set(stopwords.words(language))
        language_ratios[language] = len(overlap)  # Ratio scores

    best_match = max(language_ratios, key=language_ratios.get)

    print("Console: Text is - ", best_match)
    sys.stdout.flush()

    return best_match == 'english'
|
||||||
|
|
||||||
|
def checkLength(text):
    """Return True when the text has more than five whitespace-separated tokens."""
    return len(text.split()) > 5
|
||||||
Loading…
x
Reference in New Issue
Block a user