From 1c2b58dcfc0ed899d40c911c6c7fbd3bd7b83b3f Mon Sep 17 00:00:00 2001
From: andrewso <9V5f1FkzI2LD>
Date: Fri, 21 Feb 2020 16:45:26 +0000
Subject: [PATCH] [21.02.20] Utility function extraction

---
 src/tweets/collector.py         |  6 ++--
 src/utils/tweetPreprocessing.py | 61 +++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 3 deletions(-)
 create mode 100644 src/utils/tweetPreprocessing.py

diff --git a/src/tweets/collector.py b/src/tweets/collector.py
index 855f202..99e6cd1 100644
--- a/src/tweets/collector.py
+++ b/src/tweets/collector.py
@@ -1,9 +1,7 @@
 #!/usr/bin/env python
 
-import os, re, sys
+import os, sys
 
-from nltk import wordpunct_tokenize
-from nltk.corpus import stopwords
 from datetime import datetime, timedelta
 import time
 
@@ -16,6 +14,8 @@ from pathlib import Path # python3 only
 env_path = Path('.') / 'configuration/twitter.env'
 load_dotenv(dotenv_path=env_path)
 
+from utils.tweetPreprocessing import *
+
 class keys():
 
     def __init__(self):
diff --git a/src/utils/tweetPreprocessing.py b/src/utils/tweetPreprocessing.py
new file mode 100644
index 0000000..3ad53a4
--- /dev/null
+++ b/src/utils/tweetPreprocessing.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+import re, sys
+
+from nltk import wordpunct_tokenize
+from nltk.corpus import stopwords
+
+# Provides list of unicode emojis for extraction
+import emoji as ji
+
+def cleanTweet(text):
+    # Function to clean tweets, removes links and special characters
+    return re.sub(r'([^0-9A-Za-z \-\%\£\$ \t])|(@[A-Za-z0-9]+)|(http\S+)', '', text), ' '.join(c for c in text if c in ji.UNICODE_EMOJI)
+
+def removeSpacing(text):
+    return re.sub(r'( +)', ' ', text)
+
+def fixLines(text):
+    return re.sub(r"([\r\n])", " ", text)
+    #return re.sub(r"(\w)([A-Z])", r"\1 \2", text)
+
+def remove_non_ascii(text):
+    return ''.join(i for i in text if ord(i)<128)
+
+def detectLaguage(text):
+    """
+    Calculate the probability of given text is written in several languages
+    Using nltk stopwords and comparing to all supported languages
+    There are other ways to identify this - TextBlob.detect_language and Ngrams
+    """
+
+    language_ratios = {}
+    tokens = wordpunct_tokenize(text)
+    words = [word.lower() for word in tokens]
+
+    # Compute per language in nltk number of stopwords in text
+    for language in stopwords.fileids():
+        stopwords_set = set(stopwords.words(language))
+        words_set = set(words)
+        common_elements = words_set.intersection(stopwords_set)
+
+        language_ratios[language] = len(common_elements) # Ratio scores
+
+    ratios = language_ratios
+
+    highest_ratio = max(ratios, key=ratios.get)
+
+    print("Console: Text is - ", highest_ratio)
+    sys.stdout.flush()
+
+    if highest_ratio == 'english':
+        return True
+    else:
+        return False
+
+def checkLength(text):
+    tokens = text.split()
+    if len(tokens) <= 5:
+        return False
+    else:
+        return True
\ No newline at end of file