[21.02.20] Utility function extraction
This commit is contained in:
parent
11c29cc921
commit
1c2b58dcfc
@ -1,9 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os, re, sys
|
||||
import os, sys
|
||||
|
||||
from nltk import wordpunct_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
from datetime import datetime, timedelta
|
||||
import time
|
||||
|
||||
@ -16,6 +14,8 @@ from pathlib import Path # python3 only
|
||||
env_path = Path('.') / 'configuration/twitter.env'
|
||||
load_dotenv(dotenv_path=env_path)
|
||||
|
||||
from utils.tweetPreprocessing import *
|
||||
|
||||
class keys():
|
||||
|
||||
def __init__(self):
|
||||
|
||||
61
src/utils/tweetPreprocessing.py
Normal file
61
src/utils/tweetPreprocessing.py
Normal file
@ -0,0 +1,61 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import re, sys
|
||||
|
||||
from nltk import wordpunct_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
# Provides list of unicode emojis for extraction
|
||||
import emoji as ji
|
||||
|
||||
def cleanTweet(text):
    """Strip links, @-mentions and special characters from a tweet.

    Returns a 2-tuple:
        (cleaned text, space-joined string of emojis found in the ORIGINAL text)

    Note: emoji extraction scans the untouched input, so emojis removed by
    the regex are still reported.
    """
    # Characters kept: alphanumerics, space/tab and - % £ $; everything
    # else (plus mentions and http links) is deleted.
    noise = re.compile(r'([^0-9A-Za-z \-\%\£\$ \t])|(@[A-Za-z0-9]+)|(http\S+)')
    cleaned = noise.sub('', text)
    emojis = ' '.join(ch for ch in text if ch in ji.UNICODE_EMOJI)
    return cleaned, emojis
|
||||
|
||||
def removeSpacing(text):
    """Collapse every run of consecutive spaces into a single space."""
    # Only runs of 2+ need rewriting; single spaces are already fine,
    # which yields the same output as substituting every run of 1+.
    return re.sub(r" {2,}", " ", text)
|
||||
|
||||
def fixLines(text):
    """Replace each carriage return and newline with a single space.

    A Windows line ending "\r\n" therefore becomes two spaces, exactly as
    the character-class regex it replaces did.
    """
    return text.replace("\r", " ").replace("\n", " ")
|
||||
|
||||
def remove_non_ascii(text):
    """Drop every character outside the 7-bit ASCII range (ord >= 128)."""
    # encode with errors="ignore" silently discards non-ASCII characters,
    # equivalent to filtering on ord(c) < 128.
    return text.encode("ascii", "ignore").decode("ascii")
|
||||
|
||||
def detectLaguage(text):
    """Guess whether *text* is written in English.

    Scores every language in the nltk stopword corpus by counting how many
    of its stopwords occur in the text, then takes the best-scoring
    language. Prints the winner to stdout as a side effect.

    There are other ways to identify language - TextBlob.detect_language
    and n-grams.

    Returns:
        bool: True when the highest-scoring language is 'english'.

    NOTE(review): function name is misspelled ("Laguage") but kept for
    backward compatibility with existing callers.
    """
    tokens = wordpunct_tokenize(text)
    # Build the lower-cased word set ONCE; it is identical for every
    # language, so it must not be rebuilt inside the loop.
    words_set = {word.lower() for word in tokens}

    language_ratios = {}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        # Score = number of this language's stopwords present in the text.
        language_ratios[language] = len(words_set & stopwords_set)

    highest_ratio = max(language_ratios, key=language_ratios.get)

    print("Console: Text is - ", highest_ratio)
    sys.stdout.flush()

    return highest_ratio == 'english'
|
||||
|
||||
def checkLength(text):
    """Return True when the text contains more than five whitespace-separated tokens."""
    # The comparison already yields a bool, so no explicit branch is needed.
    return len(text.split()) > 5
|
||||
Loading…
x
Reference in New Issue
Block a user