[21.02.20] Utility function extraction
This commit is contained in:
parent
11c29cc921
commit
1c2b58dcfc
@ -1,9 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import os, re, sys
|
import os, sys
|
||||||
|
|
||||||
from nltk import wordpunct_tokenize
|
|
||||||
from nltk.corpus import stopwords
|
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -16,6 +14,8 @@ from pathlib import Path # python3 only
|
|||||||
env_path = Path('.') / 'configuration/twitter.env'
|
env_path = Path('.') / 'configuration/twitter.env'
|
||||||
load_dotenv(dotenv_path=env_path)
|
load_dotenv(dotenv_path=env_path)
|
||||||
|
|
||||||
|
from utils.tweetPreprocessing import *
|
||||||
|
|
||||||
class keys():
|
class keys():
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|||||||
61
src/utils/tweetPreprocessing.py
Normal file
61
src/utils/tweetPreprocessing.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import re, sys
|
||||||
|
|
||||||
|
from nltk import wordpunct_tokenize
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
|
||||||
|
# Provides list of unicode emojis for extraction
|
||||||
|
import emoji as ji
|
||||||
|
|
||||||
|
def cleanTweet(text):
    """Clean a tweet and extract its emojis.

    Returns a tuple:
      - the tweet text with @mentions, URLs, and special characters removed
        (digits, letters, space, tab, and - % £ $ are kept)
      - a space-joined string of the emoji characters found in the text

    NOTE(review): relies on the module-level `import emoji as ji`; newer
    releases of the emoji package renamed UNICODE_EMOJI — verify the
    installed version still exposes it.
    """
    # The mention and URL alternatives must come BEFORE the single-character
    # class: previously the class was tried first and consumed the leading
    # '@' on its own, so the mention branch never matched and usernames
    # were left behind in the cleaned text.
    cleaned = re.sub(
        r'(@[A-Za-z0-9]+)|(http\S+)|([^0-9A-Za-z \-\%\£\$\t])',
        '',
        text,
    )
    return cleaned, ' '.join(c for c in text if c in ji.UNICODE_EMOJI)
|
||||||
|
|
||||||
|
def removeSpacing(text):
    """Collapse every run of consecutive spaces into a single space."""
    repeated_spaces = re.compile(r' {2,}')
    return repeated_spaces.sub(' ', text)
|
||||||
|
|
||||||
|
def fixLines(text):
    """Replace every carriage return and newline with a single space."""
    return text.replace('\r', ' ').replace('\n', ' ')
|
||||||
|
|
||||||
|
def remove_non_ascii(text):
    """Strip every character outside the 7-bit ASCII range."""
    ascii_chars = [ch for ch in text if ord(ch) < 128]
    return ''.join(ascii_chars)
|
||||||
|
|
||||||
|
def detectLaguage(text):
    """Return True when the text is most likely written in English.

    Scores every language in nltk's stopword corpus by counting how many
    of its stopwords occur in the text, then picks the language with the
    highest overlap. There are other ways to identify language —
    TextBlob.detect_language and n-gram models, for instance.
    """
    words = {token.lower() for token in wordpunct_tokenize(text)}

    # Per nltk language: how many of its stopwords appear in the text.
    language_ratios = {}
    for language in stopwords.fileids():
        overlap = words & set(stopwords.words(language))
        language_ratios[language] = len(overlap)  # Ratio scores

    best_match = max(language_ratios, key=language_ratios.get)

    print("Console: Text is - ", best_match)
    sys.stdout.flush()

    return best_match == 'english'
|
||||||
|
|
||||||
|
def checkLength(text):
    """Return True when the text has more than five whitespace-separated tokens."""
    return len(text.split()) > 5
|
||||||
Loading…
x
Reference in New Issue
Block a user