diff --git a/src/resources/tweet_spam_ham.csv b/src/resources/tweet_spam_ham.csv index 9f73d9b..ad3b80e 100644 --- a/src/resources/tweet_spam_ham.csv +++ b/src/resources/tweet_spam_ham.csv @@ -2847,7 +2847,7 @@ spam,Prices in USD BTC 6 445 54 / 5 602 20BCH 316 17 / 273 06BTG 21 49 / 18 56DG spam,1H2019/05/08 2100 2019/05/08 2000LONG 18384 57 BTC +85 39 BTCSHORT 32423 37 BTC -23 78 BTCLS 36% vs 63% 36% vs 63% spam,1 $BTC 418 Tweets2 $TRX 194 Tweets3 $DOGE 70 Tweets4 $C20 57 Tweets5 $XRP 35 Tweets2019-05-09 0800 2019-05-09 0859COINTREND Twitter jp/ ham,$BTC I see some sub penny traders now jumping up and down cause $BTC is at $6000 00 The key level psychologically is $10 000 00 I'll Feel a little more at ease when this level starts to consolidate -spam,Doviz-------------------USD 6 1839EUR 6 9272GBP 8 0460--------------------------------------BTC-------------------Gobaba 40563 17BtcTurk 36697 00Koinim 36669 00Paribu 36700 00Koineks 37149 99 +spam,Doviz USD 6 1839EUR 6 9272GBP 8 0460 BTC Gobaba 40563 17BtcTurk 36697 00Koinim 36669 00Paribu 36700 00Koineks 37149 99 spam,$BTCUSD 1 Hour UPTREND Continued 09-May-2019 004737 UTC $BTC cryptosignal crypto cryptocurrency bitcoin spam,2019/05/09 0900BTC 653707ETH 18667 6ETC 623 7BCH 31291 6XRP 32 7XEM 5 5LSK 185 8MONA 105 1 Bitcoin bitFlyer Coincheck spam,INVEST $1 BITCOIN OPPORTUNITY WORLDWIDE ? $1 and SET n Forget ! - $1 00 Find Out More htmlClassifiedAds INVEST BITCOIN OPPORTUNITY WORLDWIDE @@ -2860,7 +2860,7 @@ ham,**** Crypto Alert Breaking****Bitcoin BREAKS $6 000 00! ! 
BTC is currently t spam,The largest exchange premiums & discounts seen by CrossCoinCo at 0015 UTC BTC vs BRL on Negociecoins is trading at a 5 3% premium to USD markets ETH vs UAH on Exmo is trading at a 4 2% discount to USD markets Visit spam,BTC $5982 46 Up +$0 26 +0 00% in the last hour bitcoin bitsmart spam,1666 761 1874May 9 2019 000900 UTC bitcoin -spam,Doviz-------------------USD 6 1839EUR 6 9272GBP 8 0460--------------------------------------BTC-------------------Gobaba 40404 89BtcTurk 36493 00Koinim 36669 00Paribu 36450 00Koineks 36999 99 +spam,Doviz USD 6 1839EUR 6 9272GBP 8 0460 BTC Gobaba 40404 89BtcTurk 36493 00Koinim 36669 00Paribu 36450 00Koineks 36999 99 ham,Hurry! 10 42% direct arbitrage in WAX If you buy WAX in BTC market from CoinDCX and sell it on Livecoin in BTC market you can make a maximum profit of 0 00 spam,Bitcoin $6 029 08 v BitcoinCash $429 87 BTC/BCH 14 0 Avg Transaction fee for Bitcoin ~$1 66 v BitcoinCash ~$0 00 - 2019/05/09 0900JST ham,BTCUSD Market 1H timeframe on May 8 at 2300 UTC is Bullish cryptocurrency bitcoin btc crypto trading idea report technical analysis @@ -2880,7 +2880,7 @@ ham,2017 Redux Bitcoin Marches Relentlessly Higher Eating Altcoins $6400 Next? 
"""Keyword whitelist used to decide whether a tweet looks crypto related."""

import logging

try:
    from src.utils.jsonLogger import log
except ImportError:
    # Keeps the module importable outside the project tree (for example in
    # an isolated test run) by routing messages to the stdlib logger.
    _fallback_logger = logging.getLogger(__name__)

    def log(message, level="INFO"):
        """Stand-in with the ``log(message, level)`` call shape used below."""
        numeric_level = getattr(logging, str(level).upper(), logging.INFO)
        if not isinstance(numeric_level, int):
            numeric_level = logging.INFO
        _fallback_logger.log(numeric_level, message)


# Lowercase keywords; a tweet is kept when at least one of them occurs in it.
# Every entry needs its trailing comma: two adjacent string literals are
# silently concatenated by Python ("novice" "passed" -> "novicepassed").
whitelist = [
    "bull",
    "bear",
    "bullish",
    "bearish",
    "up",
    "down",
    "high",
    "low",
    "higher",
    "lower",
    "absconded",
    "maximalists",
    "regulate",
    "infamous",
    "tradehigher",
    "tradelower",
    "revival",
    "centralized",
    "decentralized",
    "centralised",
    "decentralised",
    "decentralization",
    "decentralisation",
    "centralization",
    "centralisation",
    "bans",
    "hodl",
    "ambiguity",
    "revolutionize",
    "revolutionise",
    "consolidation",
    "shorts",
    "longs",
    "long",
    "short",
    "shorting",
    "grow",
    "volatile",
    "rally",
    "rallying",
    "noob",
    "noobs",
    "innovation",
    "bottom",
    "top",
    "topped",
    "bottomed",
    "upwards",
    "downwards",
    "invest",
    "raging",
    "rocketing",
    "swing",
    "swinging",
    "stake",
    "whale",
    "whales",
    "lull",
    "moon",
    "choppy",
    "buy",
    "buying",
    "sell",
    "selling",
    "startselling",
    "stopselling",
    "startbuying",
    "stopbuying",
    "bitcoin",
    "btc",
    "eth",
    "xmr",
    "xrp",
    "ripple",
    "block",
    "reward",
    "airdrop",
    "drop",
    "raise",
    "stack",
    "pull",
    "push",
    "token",
    "sale",
    "unhappy",
    "happy",
    "expert",
    "novice",
    "passed",
    "mark",
    "decline",
    "incline",
    "fees",
    "crypto",
    "wallet",
    "price",
    "history",
    "reached",
    "upward",
    "downward",
    "trading",
    "mining",
    "defi",
    "finance",
    "blockchain",
    "interest",
    "alt",
    "alts",
    "fiat",
    "currency",
    "currencies",
    "wealth",
    "hype",
    "hyped",
    "achievement",
    "platform",
    "incremental",
    "increment",
    "decrement",
    "decremental",
    "success",
    "loss",
    "win",
    "lose",
    "worth",
    "strongest",
    "weakest",
    "strong",
    "weak",
    "trade",
    "popping",
    "sucking",
    "shard",
    "sharding",
    "industry",
]


def filterOutTweetsWithNoneWhitelistedWords(text):
    """Return *text* if it contains a whitelisted keyword, else ``""``.

    Matching is a case-sensitive substring test against ``whitelist``, whose
    entries are all lowercase, so callers should lowercase the tweet first.

    Args:
        text: The tweet text to check. ``None`` or an empty string is
            treated as "no keywords".

    Returns:
        The unchanged ``text`` when at least one keyword occurs in it,
        otherwise an empty string. Returning ``""`` rather than ``None``
        keeps the result safe to pass on to string-based steps such as
        tokenisation.
    """
    if not text:
        return ""

    # NOTE(review): substring matching is permissive ("up" matches
    # "support", "eth" matches "something"). Kept as-is to preserve which
    # tweets pass; consider token-level matching if the filter is too loose.
    if any(keyword in text for keyword in whitelist):
        return text

    log(
        "Tweet [{}] did not contain any keywords for it to be considered crypto related".format(text),
        'WARN',
    )
    return ""