[21.02.20] Utility function extraction

This commit is contained in:
andrewso 2020-02-21 16:45:26 +00:00
parent 11c29cc921
commit 1c2b58dcfc
2 changed files with 64 additions and 3 deletions

View File

@ -1,9 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
import os, re, sys import os, sys
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from datetime import datetime, timedelta from datetime import datetime, timedelta
import time import time
@ -16,6 +14,8 @@ from pathlib import Path # python3 only
env_path = Path('.') / 'configuration/twitter.env' env_path = Path('.') / 'configuration/twitter.env'
load_dotenv(dotenv_path=env_path) load_dotenv(dotenv_path=env_path)
from utils.tweetPreprocessing import *
class keys(): class keys():
def __init__(self): def __init__(self):

View File

@ -0,0 +1,61 @@
#!/usr/bin/env python
import re, sys
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
# Provides list of unicode emojis for extraction
import emoji as ji
def cleanTweet(text):
    """Strip mentions, links and special characters from a tweet.

    Returns a 2-tuple ``(cleaned_text, emojis)``:

    * ``cleaned_text`` is ``text`` with ``@mentions``, ``http...`` links and
      every character other than ASCII letters, digits, space, tab and
      ``- % £ $`` removed.
    * ``emojis`` is a space-separated string of the characters of ``text``
      that are keys of ``ji.UNICODE_EMOJI``.
    """
    # Alternation is tried left to right at each position, so mentions and
    # links must come BEFORE the single-character class. With the class
    # first, '@' was consumed on its own as a "special character" and the
    # mention alternative could never match, leaving the user name behind.
    # '_' is included because it is a legal character in Twitter handles.
    noise = r'@[A-Za-z0-9_]+|http\S+|[^0-9A-Za-z \-\%\£\$ \t]'
    cleaned = re.sub(noise, '', text)

    # NOTE(review): assumes ji.UNICODE_EMOJI is a flat mapping keyed by the
    # emoji character itself - confirm against the pinned emoji version.
    emojis = ' '.join(c for c in text if c in ji.UNICODE_EMOJI)
    return cleaned, emojis
def removeSpacing(text):
    """Collapse each run of consecutive space characters into one space.

    Only the space character is matched; tabs and line breaks are left
    untouched.
    """
    space_run = re.compile(r' +')
    return space_run.sub(' ', text)
def fixLines(text):
    """Replace every carriage return or line feed in ``text`` with a space.

    Each line-break character is replaced individually, so a CR+LF pair
    becomes two spaces.
    """
    # NOTE: a disabled camel-case splitting variant used to live here:
    #   re.sub(r"(\w)([A-Z])", r"\1 \2", text)
    line_break = re.compile(r"[\r\n]")
    return line_break.sub(" ", text)
def remove_non_ascii(text):
    """Return ``text`` with every character whose code point is 128 or above removed."""
    ascii_chars = [char for char in text if ord(char) < 128]
    return ''.join(ascii_chars)
def detectLaguage(text):
    """
    Report whether ``text`` appears to be written in English.

    Every language in the nltk stopwords corpus is scored by the number of
    its stopwords that occur among the lower-cased tokens of ``text``; the
    language with the highest score is taken as the detected language.
    There are other ways to identify this - TextBlob.detect_language and Ngrams

    The detected language is also printed to stdout, which is then flushed.

    Returns True when the highest-scoring language is 'english', otherwise
    False. On a tie (including text that contains no stopwords at all) the
    language listed first by ``stopwords.fileids()`` wins.
    """
    # Build the set of distinct lower-cased tokens once; it does not depend
    # on the language being scored.
    words_set = {token.lower() for token in wordpunct_tokenize(text)}

    # Number of distinct stopwords of each language found in the text.
    # Insertion order follows stopwords.fileids(), which fixes tie-breaking.
    language_ratios = {
        language: len(words_set.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }

    highest_ratio = max(language_ratios, key=language_ratios.get)

    print("Console: Text is - ", highest_ratio)
    sys.stdout.flush()

    return highest_ratio == 'english'
def checkLength(text, min_tokens=6):
    """Return True when ``text`` has at least ``min_tokens`` whitespace-separated tokens.

    The default of 6 keeps the original rule: text with 5 or fewer tokens
    is rejected (False), anything longer is accepted (True).
    """
    return len(text.split()) >= min_tokens