diff --git a/document.aux b/document.aux
index 676658c..1400d44 100644
--- a/document.aux
+++ b/document.aux
@@ -154,85 +154,109 @@
 \newlabel{machine}{{}{27}{Machine Learning}{section*.27}{}}
 \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Naive Bayes}{27}{section*.28}}
 \abx@aux@segm{0}{0}{32}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Random Forest}{28}{section*.29}}
 \abx@aux@cite{33}
 \abx@aux@segm{0}{0}{33}
+\abx@aux@segm{0}{0}{33}
+\abx@aux@segm{0}{0}{33}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Bag Of Words}{28}{section*.29}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{TF-IDF}{28}{section*.30}}
+\abx@aux@cite{34}
+\abx@aux@segm{0}{0}{34}
+\abx@aux@segm{0}{0}{34}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Additive Smoothing}{29}{section*.31}}
+\abx@aux@cite{35}
+\abx@aux@segm{0}{0}{35}
+\abx@aux@segm{0}{0}{7}
 \abx@aux@segm{0}{0}{8}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Solution Approach}{29}{section*.30}}
-\newlabel{solution}{{}{29}{Solution Approach}{section*.30}{}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data gathering}{29}{section*.31}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data pre-processing}{30}{section*.32}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Spam Filtering}{30}{section*.33}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Solution Approach}{30}{section*.32}}
+\newlabel{solution}{{}{30}{Solution Approach}{section*.32}{}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data gathering}{30}{section*.33}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data pre-processing}{31}{section*.34}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Spam Filtering}{31}{section*.35}}
 \abx@aux@segm{0}{0}{12}
 \abx@aux@segm{0}{0}{12}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Language Detection}{31}{section*.34}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Sentiment Analysis}{31}{section*.35}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Language Detection}{32}{section*.36}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Sentiment Analysis}{32}{section*.37}}
 \abx@aux@segm{0}{0}{12}
 \abx@aux@segm{0}{0}{11}
 \abx@aux@segm{0}{0}{22}
 \abx@aux@segm{0}{0}{5}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Neural Network}{32}{section*.36}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Neural Network}{33}{section*.38}}
 \abx@aux@segm{0}{0}{25}
 \abx@aux@segm{0}{0}{25}
-\abx@aux@cite{34}
-\abx@aux@segm{0}{0}{34}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Price Forecasting}{34}{section*.38}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Frontend Application}{34}{section*.39}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{With reference to Initial PID}{34}{section*.40}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Solution Summary}{35}{section*.41}}
-\newlabel{summary}{{}{35}{Solution Summary}{section*.41}{}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data flow Overview}{36}{section*.42}}
-\newlabel{data-flow}{{}{36}{Data flow Overview}{section*.42}{}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{System Design}{37}{section*.43}}
-\newlabel{Design}{{}{37}{System Design}{section*.43}{}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Dataflow Designs}{37}{section*.44}}
-\abx@aux@segm{0}{0}{12}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Interface Design}{44}{section*.45}}
-\abx@aux@cite{35}
-\abx@aux@segm{0}{0}{35}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Implementation}{45}{section*.47}}
-\newlabel{implementation}{{}{45}{Implementation}{section*.47}{}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data collection}{45}{section*.48}}
-\newlabel{collection}{{}{45}{Data collection}{section*.48}{}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Price Time-Series Historical Data}{45}{section*.49}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {1}Historical price collection and averaging per exchange}{45}{lstlisting.1}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Price Time-Series Live Data}{46}{section*.50}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {2}Extraction of Price from exchanges}{46}{lstlisting.2}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {3}Creation of the unbiased hourly price}{47}{lstlisting.3}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Historical Tweet Collection}{48}{section*.51}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {4}Sample Curl request - data saved to json and python script called to process data}{48}{lstlisting.4}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {5}Sift-text python script - used alongside Curl command in Listing 4}{48}{lstlisting.5}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Live Tweet Collection}{50}{section*.52}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {6}Spam filter initialisation and training functions}{50}{lstlisting.6}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {7}Tweepy Streamer setup}{51}{lstlisting.7}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {8}Tweepy Stream: 'on\_data' method}{52}{lstlisting.8}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data pre-processing}{55}{section*.53}}
-\newlabel{processing}{{}{55}{Data pre-processing}{section*.53}{}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {9}Basic data filtering and processing function - defined in 'tweet\_collector.py'}{55}{lstlisting.9}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {10}Language detection and filter function \cite {36}}{56}{lstlisting.10}}
 \abx@aux@cite{36}
 \abx@aux@segm{0}{0}{36}
-\abx@aux@segm{0}{0}{36}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Spam Filtering}{57}{section*.54}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {11}Spam filter training Class}{57}{lstlisting.11}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Sentiment Analysis}{59}{section*.55}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{VADER}{59}{section*.56}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Recurrent Neural Network - LSTM}{59}{section*.57}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Training and Testing Model}{59}{section*.58}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Scoring and Validation}{59}{section*.59}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Future Prediction Forecasting}{59}{section*.60}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Main File 'Main.py'}{59}{section*.61}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Miscellaneous}{59}{section*.62}}
-\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {12}keys class - loads API keys for access}{59}{lstlisting.12}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Testing Metrics and Accuracy}{60}{section*.63}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Project Evaluation}{61}{section*.64}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Discussion: Contribution and Reflection}{61}{section*.65}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Limitations}{61}{section*.66}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Conclusion and Future Improvements}{62}{section*.67}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Conclusion}{62}{section*.68}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Future Improvements}{62}{section*.69}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Appendices}{67}{section*.71}}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Appendix A - Project Initiation Document}{67}{section*.72}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Price Forecasting}{35}{section*.40}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Frontend Application}{35}{section*.41}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{With reference to Initial PID}{35}{section*.42}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Solution Summary}{36}{section*.43}}
+\newlabel{summary}{{}{36}{Solution Summary}{section*.43}{}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data flow Overview}{37}{section*.44}}
+\newlabel{data-flow}{{}{37}{Data flow Overview}{section*.44}{}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{System Design}{38}{section*.45}}
+\newlabel{Design}{{}{38}{System Design}{section*.45}{}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Dataflow Designs}{38}{section*.46}}
+\abx@aux@segm{0}{0}{12}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Interface Design}{45}{section*.47}}
+\abx@aux@cite{37}
+\abx@aux@segm{0}{0}{37}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Implementation}{46}{section*.49}}
+\newlabel{implementation}{{}{46}{Implementation}{section*.49}{}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data collection}{46}{section*.50}}
+\newlabel{collection}{{}{46}{Data collection}{section*.50}{}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Price Time-Series Historical Data}{46}{section*.51}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {1}Historical price collection and averaging per exchange}{46}{lstlisting.1}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Price Time-Series Live Data}{47}{section*.52}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {2}Extraction of Price from exchanges}{47}{lstlisting.2}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {3}Creation of the unbiased hourly price}{48}{lstlisting.3}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Historical Tweet Collection}{49}{section*.53}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {4}Sample Curl request - data saved to json and python script called to process data}{49}{lstlisting.4}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {5}Sift-text python script - used alongside Curl command in Listing 4}{49}{lstlisting.5}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Live Tweet Collection}{51}{section*.54}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {6}Spam filter initialisation and training functions}{51}{lstlisting.6}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {7}Tweepy Streamer setup}{52}{lstlisting.7}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {8}Tweepy Stream: 'on\_data' method}{53}{lstlisting.8}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Data pre-processing}{56}{section*.55}}
+\newlabel{processing}{{}{56}{Data pre-processing}{section*.55}{}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {9}Basic data filtering and processing function - defined in 'tweet\_collector.py'}{56}{lstlisting.9}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {10}Language detection and filter function \cite {38}}{57}{lstlisting.10}}
+\abx@aux@cite{38}
+\abx@aux@segm{0}{0}{38}
+\abx@aux@segm{0}{0}{38}
+\abx@aux@cite{39}
+\abx@aux@segm{0}{0}{39}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {11}pre-processing of data prior to being used by the spam filter}{58}{lstlisting.11}}
+\abx@aux@cite{40}
+\abx@aux@segm{0}{0}{40}
+\abx@aux@segm{0}{0}{40}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Spam Filtering}{60}{section*.56}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {12}Spam filter training Class - \textit {tweet\_collector.py}}{60}{lstlisting.12}}
+\abx@aux@cite{41}
+\abx@aux@segm{0}{0}{41}
+\abx@aux@segm{0}{0}{34}
+\abx@aux@cite{42}
+\abx@aux@segm{0}{0}{42}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {13}classifier class of spam\_filter.py}{63}{lstlisting.13}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {14}Classify Function of Parent classifier class of spam\_filter.py}{65}{lstlisting.14}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {15}Predict function of parent classifier class of spam\_filter.py}{65}{lstlisting.15}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Sentiment Analysis}{66}{section*.57}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{VADER}{66}{section*.58}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Recurrent Neural Network - LSTM}{66}{section*.59}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Training and Testing Model}{66}{section*.60}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsubsection}{Scoring and Validation}{66}{section*.61}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Future Prediction Forecasting}{66}{section*.62}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Main File 'Main.py'}{66}{section*.63}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Miscellaneous}{66}{section*.64}}
+\@writefile{lol}{\defcounter {refsection}{0}\relax }\@writefile{lol}{\contentsline {lstlisting}{\numberline {16}keys class - loads API keys for access}{66}{lstlisting.16}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Testing Metrics and Accuracy}{67}{section*.65}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Project Evaluation}{68}{section*.66}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Discussion: Contribution and Reflection}{68}{section*.67}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Limitations}{68}{section*.68}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Conclusion and Future Improvements}{69}{section*.69}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Conclusion}{69}{section*.70}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Future Improvements}{69}{section*.71}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{Appendices}{74}{section*.73}}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Appendix A - Project Initiation Document}{74}{section*.74}}
 \abx@aux@refcontextdefaultsdone
 \abx@aux@defaultrefcontext{0}{1}{none/global//global/global}
 \abx@aux@defaultrefcontext{0}{2}{none/global//global/global}
@@ -270,4 +294,10 @@
 \abx@aux@defaultrefcontext{0}{34}{none/global//global/global}
 \abx@aux@defaultrefcontext{0}{35}{none/global//global/global}
 \abx@aux@defaultrefcontext{0}{36}{none/global//global/global}
-\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Appendix B - Log book}{80}{section*.73}}
+\abx@aux@defaultrefcontext{0}{37}{none/global//global/global}
+\abx@aux@defaultrefcontext{0}{38}{none/global//global/global}
+\abx@aux@defaultrefcontext{0}{39}{none/global//global/global}
+\abx@aux@defaultrefcontext{0}{40}{none/global//global/global}
+\abx@aux@defaultrefcontext{0}{41}{none/global//global/global}
+\abx@aux@defaultrefcontext{0}{42}{none/global//global/global}
+\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{Appendix B - Log book}{87}{section*.75}}
diff --git a/document.bbl b/document.bbl
index 96c6fc2..a3fe150 100644
--- a/document.bbl
+++ b/document.bbl
@@ -1100,6 +1100,65 @@
 \endverb
 \endentry
 \entry{33}{inproceedings}{}
+ \name{author}{1}{}{%
+ {{hash=98de2b3b288cf4bd291e8937e5e28c49}{%
+ family={Skymind},
+ familyi={S\bibinitperiod}}}%
+ }
+ \list{organization}{1}{%
+ {Skymind}%
+ }
+ \strng{namehash}{98de2b3b288cf4bd291e8937e5e28c49}
+ \strng{fullhash}{98de2b3b288cf4bd291e8937e5e28c49}
+ \strng{bibnamehash}{98de2b3b288cf4bd291e8937e5e28c49}
+ \strng{authorbibnamehash}{98de2b3b288cf4bd291e8937e5e28c49}
+ \strng{authornamehash}{98de2b3b288cf4bd291e8937e5e28c49}
+ \strng{authorfullhash}{98de2b3b288cf4bd291e8937e5e28c49}
+ \field{sortinit}{5}
+ \field{sortinithash}{3c19c3776b658b3558e9e2e4840c01e2}
+ \field{labelnamesource}{author}
+ \field{labeltitlesource}{title}
+ \field{booktitle}{A.I Wiki}
+ \field{title}{A Beginner's Guide to Bag of Words and TF-IDF}
+ \field{year}{2018}
+ \verb{urlraw}
+ \verb https://skymind.ai/wiki/bagofwords-tf-idf
+ \endverb
+ \verb{url}
+ \verb https://skymind.ai/wiki/bagofwords-tf-idf
+ \endverb
+ \endentry
+ \entry{34}{inproceedings}{}
+ \name{author}{1}{}{%
+ {{hash=66cc3e81d0437f8fcef4bb3cb3294bc5}{%
+ family={Karmali},
+ familyi={K\bibinitperiod},
+ given={Tejan},
+ giveni={T\bibinitperiod}}}%
+ }
+ \list{organization}{1}{%
+ {Towards Data Science}%
+ }
+ \strng{namehash}{66cc3e81d0437f8fcef4bb3cb3294bc5}
+ \strng{fullhash}{66cc3e81d0437f8fcef4bb3cb3294bc5}
+ \strng{bibnamehash}{66cc3e81d0437f8fcef4bb3cb3294bc5}
+ \strng{authorbibnamehash}{66cc3e81d0437f8fcef4bb3cb3294bc5}
+ \strng{authornamehash}{66cc3e81d0437f8fcef4bb3cb3294bc5}
+ \strng{authorfullhash}{66cc3e81d0437f8fcef4bb3cb3294bc5}
+ \field{sortinit}{5}
+ \field{sortinithash}{3c19c3776b658b3558e9e2e4840c01e2}
+ \field{labelnamesource}{author}
+ \field{labeltitlesource}{title}
+ \field{title}{Spam Classifier in Python from scratch}
+ \field{year}{Aug 2, 2017}
+ \verb{urlraw}
+ \verb https://towardsdatascience.com/spam-classifier-in-python-from-scratch-27a98ddd8e73
+ \endverb
+ \verb{url}
+ \verb https://towardsdatascience.com/spam-classifier-in-python-from-scratch-27a98ddd8e73
+ \endverb
+ \endentry
+ \entry{35}{inproceedings}{}
 \name{author}{1}{}{%
 {{hash=1e48a059b3f2c3703bea8d54a9e002c9}{%
 family={Roesslein},
@@ -1113,8 +1172,8 @@
 \strng{authorbibnamehash}{1e48a059b3f2c3703bea8d54a9e002c9}
 \strng{authornamehash}{1e48a059b3f2c3703bea8d54a9e002c9}
 \strng{authorfullhash}{1e48a059b3f2c3703bea8d54a9e002c9}
- \field{sortinit}{5}
- \field{sortinithash}{3c19c3776b658b3558e9e2e4840c01e2}
+ \field{sortinit}{6}
+ \field{sortinithash}{57e57fb8451e7fcfa45d1e069f6d3136}
 \field{labelnamesource}{author}
 \field{labeltitlesource}{title}
 \field{title}{Tweepy Documentation}
@@ -1126,7 +1185,7 @@
 \verb http://docs.tweepy.org/en/v3.5.0/
 \endverb
 \endentry
- \entry{34}{inproceedings}{}
+ \entry{36}{inproceedings}{}
 \name{author}{1}{}{%
 {{hash=974f50284a1994b00c1f04d211402eb0}{%
 family={Deoras},
@@ -1143,8 +1202,8 @@
 \strng{authorbibnamehash}{974f50284a1994b00c1f04d211402eb0}
 \strng{authornamehash}{974f50284a1994b00c1f04d211402eb0}
 \strng{authorfullhash}{974f50284a1994b00c1f04d211402eb0}
- \field{sortinit}{6}
- \field{sortinithash}{57e57fb8451e7fcfa45d1e069f6d3136}
+ \field{sortinit}{7}
+ \field{sortinithash}{c818dd9105a2852444fc9f5e145c294e}
 \field{labelnamesource}{author}
 \field{labeltitlesource}{title}
 \field{title}{Tensorflow Vs. Theano: What Do Researchers Prefer As An Artificial Intelligence Framework}
@@ -1156,7 +1215,7 @@
 \verb https://www.analyticsindiamag.com/tensorflow-vs-theano-researchers-prefer-artificial-intelligence-framework
 \endverb
 \endentry
- \entry{35}{inproceedings}{}
+ \entry{37}{inproceedings}{}
 \name{author}{1}{}{%
 {{hash=9fb73450a6ae06fd7652db34b9a3c981}{%
 family={bitcoincharts},
@@ -1171,8 +1230,8 @@
 \strng{authorbibnamehash}{9fb73450a6ae06fd7652db34b9a3c981}
 \strng{authornamehash}{9fb73450a6ae06fd7652db34b9a3c981}
 \strng{authorfullhash}{9fb73450a6ae06fd7652db34b9a3c981}
- \field{sortinit}{6}
- \field{sortinithash}{57e57fb8451e7fcfa45d1e069f6d3136}
+ \field{sortinit}{7}
+ \field{sortinithash}{c818dd9105a2852444fc9f5e145c294e}
 \field{labelnamesource}{author}
 \verb{urlraw}
 \verb http://api.bitcoincharts.com/v1/csv/
@@ -1181,7 +1240,7 @@
 \verb http://api.bitcoincharts.com/v1/csv/
 \endverb
 \endentry
- \entry{36}{inproceedings}{}
+ \entry{38}{inproceedings}{}
 \name{author}{1}{}{%
 {{hash=41b3b5daa9b33f52b08087b54f96f115}{%
 family={Nolla},
@@ -1198,8 +1257,8 @@
 \strng{authorbibnamehash}{41b3b5daa9b33f52b08087b54f96f115}
 \strng{authornamehash}{41b3b5daa9b33f52b08087b54f96f115}
 \strng{authorfullhash}{41b3b5daa9b33f52b08087b54f96f115}
- \field{sortinit}{6}
- \field{sortinithash}{57e57fb8451e7fcfa45d1e069f6d3136}
+ \field{sortinit}{7}
+ \field{sortinithash}{c818dd9105a2852444fc9f5e145c294e}
 \field{labelnamesource}{author}
 \field{labeltitlesource}{title}
 \field{title}{Detecting Text Language With Python and NLTK}
@@ -1210,6 +1269,124 @@
 \verb http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/
 \endverb
 \endentry
+ \entry{39}{inproceedings}{}
+ \name{author}{1}{}{%
+ {{hash=90ecaa1e27a33fbd3ab56223485ab402}{%
+ family={Cryptography},
+ familyi={C\bibinitperiod},
+ given={Practical},
+ giveni={P\bibinitperiod}}}%
+ }
+ \list{organization}{1}{%
+ {Practical Cryptography}%
+ }
+ \strng{namehash}{90ecaa1e27a33fbd3ab56223485ab402}
+ \strng{fullhash}{90ecaa1e27a33fbd3ab56223485ab402}
+ \strng{bibnamehash}{90ecaa1e27a33fbd3ab56223485ab402}
+ \strng{authorbibnamehash}{90ecaa1e27a33fbd3ab56223485ab402}
+ \strng{authornamehash}{90ecaa1e27a33fbd3ab56223485ab402}
+ \strng{authorfullhash}{90ecaa1e27a33fbd3ab56223485ab402}
+ \field{sortinit}{7}
+ \field{sortinithash}{c818dd9105a2852444fc9f5e145c294e}
+ \field{labelnamesource}{author}
+ \field{labeltitlesource}{title}
+ \field{title}{A tutorial on Automatic Language Identification - ngram based}
+ \verb{urlraw}
+ \verb http://practicalcryptography.com/miscellaneous/machine-learning/tutorial-automatic-language-identification-ngram-b/
+ \endverb
+ \verb{url}
+ \verb http://practicalcryptography.com/miscellaneous/machine-learning/tutorial-automatic-language-identification-ngram-b/
+ \endverb
+ \endentry
+ \entry{40}{inproceedings}{}
+ \name{author}{1}{}{%
+ {{hash=b26f0ccdcc8e766bb4785cce3a550346}{%
+ family={Risueno},
+ familyi={R\bibinitperiod},
+ given={Tita},
+ giveni={T\bibinitperiod}}}%
+ }
+ \list{organization}{1}{%
+ {Bitext}%
+ }
+ \strng{namehash}{b26f0ccdcc8e766bb4785cce3a550346}
+ \strng{fullhash}{b26f0ccdcc8e766bb4785cce3a550346}
+ \strng{bibnamehash}{b26f0ccdcc8e766bb4785cce3a550346}
+ \strng{authorbibnamehash}{b26f0ccdcc8e766bb4785cce3a550346}
+ \strng{authornamehash}{b26f0ccdcc8e766bb4785cce3a550346}
+ \strng{authorfullhash}{b26f0ccdcc8e766bb4785cce3a550346}
+ \field{sortinit}{7}
+ \field{sortinithash}{c818dd9105a2852444fc9f5e145c294e}
+ \field{labelnamesource}{author}
+ \field{labeltitlesource}{title}
+ \field{title}{What is the difference between stemming and lemmatization}
+ \field{year}{Feb 26, 2018}
+ \verb{urlraw}
+ \verb https://blog.bitext.com/what-is-the-difference-between-stemming-and-lemmatization/
+ \endverb
+ \verb{url}
+ \verb https://blog.bitext.com/what-is-the-difference-between-stemming-and-lemmatization/
+ \endverb
+ \endentry
+ \entry{41}{inproceedings}{}
+ \name{author}{1}{}{%
+ {{hash=f8880822642687df3e50f74258165974}{%
+ family={developers},
+ familyi={d\bibinitperiod},
+ prefix={scikit-learn},
+ prefixi={s\bibinithyphendelim l\bibinitperiod}}}%
+ }
+ \list{organization}{1}{%
+ {Scikit-Learn}%
+ }
+ \strng{namehash}{f8880822642687df3e50f74258165974}
+ \strng{fullhash}{f8880822642687df3e50f74258165974}
+ \strng{bibnamehash}{f8880822642687df3e50f74258165974}
+ \strng{authorbibnamehash}{f8880822642687df3e50f74258165974}
+ \strng{authornamehash}{f8880822642687df3e50f74258165974}
+ \strng{authorfullhash}{f8880822642687df3e50f74258165974}
+ \field{sortinit}{8}
+ \field{sortinithash}{07edf88d4ea82509b9c4b4d13f41c452}
+ \field{labelnamesource}{author}
+ \field{labeltitlesource}{title}
+ \field{title}{Naive Bayes}
+ \verb{urlraw}
+ \verb https://scikit-learn.org/stable/modules/naive_bayes.html
+ \endverb
+ \verb{url}
+ \verb https://scikit-learn.org/stable/modules/naive_bayes.html
+ \endverb
+ \endentry
+ \entry{42}{inproceedings}{}
+ \name{author}{1}{}{%
+ {{hash=d757410543b5d3323e9db89d373ef2ac}{%
+ family={tejank10},
+ familyi={t\bibinitperiod},
+ given={Tejan\bibnamedelimb Karmali\bibnamedelima -},
+ giveni={T\bibinitperiod\bibinitdelim K\bibinitperiod\bibinitdelim \bibinithyphendelim \bibinithyphendelim \bibinitperiod}}}%
+ }
+ \list{organization}{1}{%
+ {Github}%
+ }
+ \strng{namehash}{d757410543b5d3323e9db89d373ef2ac}
+ \strng{fullhash}{d757410543b5d3323e9db89d373ef2ac}
+ \strng{bibnamehash}{d757410543b5d3323e9db89d373ef2ac}
+ \strng{authorbibnamehash}{d757410543b5d3323e9db89d373ef2ac}
+ \strng{authornamehash}{d757410543b5d3323e9db89d373ef2ac}
+ \strng{authorfullhash}{d757410543b5d3323e9db89d373ef2ac}
+ \field{sortinit}{8}
+ \field{sortinithash}{07edf88d4ea82509b9c4b4d13f41c452}
+ \field{labelnamesource}{author}
+ \field{labeltitlesource}{title}
+ \field{title}{Spam-or-Ham}
+ \field{year}{Aug 2, 2017}
+ \verb{urlraw}
+ \verb https://github.com/tejank10/Spam-or-Ham
+ \endverb
+ \verb{url}
+ \verb https://github.com/tejank10/Spam-or-Ham
+ \endverb
+ \endentry
 \enddatalist
 \endrefsection
 \endinput
diff --git a/document.bcf b/document.bcf
index 4570ef1..518f2a7 100644
--- a/document.bcf
+++ b/document.bcf
@@ -1980,20 +1980,32 @@
 32
 32
 33
-8
-12
-12
-12
-11
-22
-5
-25
-25
-34
-12
-35
-36
-36
+33
+33
+34
+34
+35
+7
+8
+12
+12
+12
+11
+22
+5
+25
+25
+36
+12
+37
+38
+38
+39
+40
+40
+41
+34
+42
 *
diff --git a/document.blg b/document.blg
index ab4eb46..f3fdf21 100644
--- a/document.blg
+++ b/document.blg
@@ -1,20 +1,23 @@
 [0] Config.pm:302> INFO - This is Biber 2.9
 [0] Config.pm:305> INFO - Logfile is 'document.blg'
-[19] biber:313> INFO - === Thu Apr 25, 2019, 21:12:18
-[33] Biber.pm:371> INFO - Reading 'document.bcf'
-[78] Biber.pm:854> INFO - Using all citekeys in bib section 0
-[88] Biber.pm:3981> INFO - Processing section 0
-[95] Biber.pm:4154> INFO - Looking for bibtex format file 'report.bib' for section 0
-[96] bibtex.pm:1468> INFO - LaTeX decoding ...
-[127] bibtex.pm:1294> INFO - Found BibTeX data source 'report.bib'
-[184] Utils.pm:169> WARN - Name "Mairal, J., Ponce, J., Sapiro, G., Zisserman, A." has too many commas: skipping name
-[196] Utils.pm:169> WARN - year field 'Mar 13, 2016' in entry '23' is not an integer - this will probably not sort properly.
-[238] Utils.pm:169> WARN - BibTeX subsystem: warning: comma(s) at end of name (removing)
-[238] Utils.pm:169> WARN - BibTeX subsystem: author, warning: comma(s) at end of name (removing)
-[257] UCollate.pm:68> INFO - Overriding locale 'en-US' defaults 'variable = shifted' with 'variable = non-ignorable'
-[257] UCollate.pm:68> INFO - Overriding locale 'en-US' defaults 'normalization = NFD' with 'normalization = prenormalized'
-[257] Biber.pm:3809> INFO - Sorting list 'none/global//global/global' of type 'entry' with template 'none' and locale 'en-US'
-[257] Biber.pm:3815> INFO - No sort tailoring available for locale 'en-US'
-[275] bbl.pm:617> INFO - Writing 'document.bbl' with encoding 'ascii'
-[287] bbl.pm:720> INFO - Output to document.bbl
-[287] Biber.pm:110> INFO - WARNINGS: 4
+[19] biber:313> INFO - === Fri Apr 26, 2019, 16:35:53
+[34] Biber.pm:371> INFO - Reading 'document.bcf'
+[81] Biber.pm:854> INFO - Using all citekeys in bib section 0
+[91] Biber.pm:3981> INFO - Processing section 0
+[99] Biber.pm:4154> INFO - Looking for bibtex format file 'report.bib' for section 0
+[100] bibtex.pm:1468> INFO - LaTeX decoding ...
+[135] bibtex.pm:1294> INFO - Found BibTeX data source 'report.bib'
+[143] Utils.pm:169> WARN - year field 'Mar 13, 2016' in entry '23' is not an integer - this will probably not sort properly.
+[162] Utils.pm:169> WARN - year field 'Aug 2, 2017' in entry '42' is not an integer - this will probably not sort properly.
+[179] Utils.pm:169> WARN - Name "Mairal, J., Ponce, J., Sapiro, G., Zisserman, A." has too many commas: skipping name
+[183] Utils.pm:169> WARN - year field 'Aug 2, 2017' in entry '34' is not an integer - this will probably not sort properly.
+[243] Utils.pm:169> WARN - year field 'Feb 26, 2018' in entry '40' is not an integer - this will probably not sort properly.
+[262] Utils.pm:169> WARN - BibTeX subsystem: warning: comma(s) at end of name (removing)
+[262] Utils.pm:169> WARN - BibTeX subsystem: author, warning: comma(s) at end of name (removing)
+[283] UCollate.pm:68> INFO - Overriding locale 'en-US' defaults 'variable = shifted' with 'variable = non-ignorable'
+[283] UCollate.pm:68> INFO - Overriding locale 'en-US' defaults 'normalization = NFD' with 'normalization = prenormalized'
+[283] Biber.pm:3809> INFO - Sorting list 'none/global//global/global' of type 'entry' with template 'none' and locale 'en-US'
+[283] Biber.pm:3815> INFO - No sort tailoring available for locale 'en-US'
+[304] bbl.pm:617> INFO - Writing 'document.bbl' with encoding 'ascii'
+[318] bbl.pm:720> INFO - Output to document.bbl
+[318] Biber.pm:110> INFO - WARNINGS: 7
diff --git a/document.log b/document.log
index cf0d8ca..a3e722c 100644
--- a/document.log
+++ b/document.log
@@ -1,4 +1,4 @@
-This is pdfTeX, Version 3.14159265-2.6-1.40.18 (TeX Live 2017/Debian) (preloaded format=pdflatex 2018.10.16) 25 APR 2019 21:27
+This is pdfTeX, Version 3.14159265-2.6-1.40.18 (TeX Live 2017/Debian) (preloaded format=pdflatex 2018.10.16) 26 APR 2019 17:25
 entering extended mode
 restricted \write18 enabled.
 %&-line parsing enabled.
@@ -972,7 +972,7 @@
 \@outlinefile=\write5
 \openout5 = `document.out'.
-
+
 File: images/reading_logo.png Graphic file (type png)
 Package pdftex.def Info: images/reading_logo.png used on input line 94.
@@ -1063,19 +1063,19 @@
 Missing character: There is no
 Missing character: There is no € in font cmr12!
 Missing character: There is no ™ in font cmr12!
 [20]
-
+
 File: images/perceptron.png Graphic file (type png)
 Package pdftex.def Info: images/perceptron.png used on input line 338.
 (pdftex.def) Requested size: 284.52713pt x 170.72142pt.
 [21 <./images/perceptron.png>]
-
+
 File: images/rnn_ffn.png Graphic file (type png)
 Package pdftex.def Info: images/rnn_ffn.png used on input line 358.
 (pdftex.def) Requested size: 426.80307pt x 170.72112pt.
 [22 <./images/rnn_ffn.png>]
-
+
 File: images/lstm.png Graphic file (type png)
 Package pdftex.def Info: images/lstm.png used on input line 376.
@@ -1088,197 +1088,225 @@
 Missing character: There is no
 Missing character: There is no â in font cmr12!
 Missing character: There is no € in font cmr12!
 Missing character: There is no ™ in font cmr12!
- [26] [27] [28]
-Underfull \hbox (badness 10000) in paragraph at lines 488--490
+ [26] [27]
+Underfull \hbox (badness 10000) in paragraph at lines 487--490
+
+ []
+
+[28]
+Underfull \hbox (badness 10000) in paragraph at lines 504--506
 
  []
 
-Underfull \hbox (badness 10000) in paragraph at lines 492--494
+Overfull \hbox (9.85295pt too wide) detected at line 509
+\OML/cmm/m/it/12 P\OT1/cmr/m/n/12 (\OML/cmm/m/it/12 w\OMS/cmsy/m/n/12 j\OML/cmm
+/m/it/12 spam\OT1/cmr/m/n/12 ) = []
+ []
+
+[29]
+Underfull \hbox (badness 10000) in paragraph at lines 516--518
 
  []
 
-Underfull \hbox (badness 10000) in paragraph at lines 495--499
+Underfull \hbox (badness 10000) in paragraph at lines 520--522
 
  []
 
-Underfull \hbox (badness 10000) in paragraph at lines 500--502
+Underfull \hbox (badness 10000) in paragraph at lines 523--527
 
  []
 
-Underfull \hbox (badness 10000) in paragraph at lines 503--507
+Underfull \hbox (badness 10000) in paragraph at lines 528--530
 
  []
 
-[29] [30]
+Underfull \hbox (badness 10000) in paragraph at lines 531--535
+
+ []
+
+[30] [31]
 LaTeX Font Info: Font shape `OMS/cmr/m/it' in size <12> not available
-(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 520.
- [31] [32]
+(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 548.
+ [32] [33]
 Missing character: There is no â in font cmr12!
 Missing character: There is no € in font cmr12!
 Missing character: There is no ™ in font cmr12!
- [33] [34] [35]
-
+ [34] [35] [36]
+
 File: images/Generic_Flow.png Graphic file (type png)
-Package pdftex.def Info: images/Generic_Flow.png used on input line 609.
+Package pdftex.def Info: images/Generic_Flow.png used on input line 637.
 (pdftex.def) Requested size: 512.13474pt x 227.62009pt.
-Overfull \hbox (71.28728pt too wide) in paragraph at lines 609--610
+Overfull \hbox (71.28728pt too wide) in paragraph at lines 637--638
 [][]
 []
 
-[36 <./images/Generic_Flow.png (PNG copy)>]
-Underfull \hbox (badness 10000) in paragraph at lines 620--622
+[37 <./images/Generic_Flow.png (PNG copy)>]
+Underfull \hbox (badness 10000) in paragraph at lines 648--650
 
  []
 
-
+
 File: images/Dataflow.png Graphic file (type png)
-Package pdftex.def Info: images/Dataflow.png used on input line 625.
+Package pdftex.def Info: images/Dataflow.png used on input line 653.
 (pdftex.def) Requested size: 512.09683pt x 227.62125pt.
-Overfull \hbox (71.24937pt too wide) in paragraph at lines 625--627
+Overfull \hbox (71.24937pt too wide) in paragraph at lines 653--655
 []
 []
 
-[37 <./images/Dataflow.png (PNG copy)>]
-
+[38 <./images/Dataflow.png (PNG copy)>]
+
 File: images/Data_Collector.png Graphic file (type png)
-Package pdftex.def Info: images/Data_Collector.png used on input line 633.
+Package pdftex.def Info: images/Data_Collector.png used on input line 661.
 (pdftex.def) Requested size: 426.78574pt x 227.61746pt.
- [38 <./images/Data_Collector.png (PNG copy)>]
-
+ [39 <./images/Data_Collector.png (PNG copy)>]
+
 File: images/Analysis_Engine.png Graphic file (type png)
-Package pdftex.def Info: images/Analysis_Engine.png used on input line 648.
+Package pdftex.def Info: images/Analysis_Engine.png used on input line 676.
 (pdftex.def) Requested size: 483.67276pt x 227.62561pt.
-Overfull \hbox (42.8253pt too wide) in paragraph at lines 648--650
+Overfull \hbox (42.8253pt too wide) in paragraph at lines 676--678
 []
 []
 
-[39 <./images/Analysis_Engine.png (PNG copy)>]
-
+[40 <./images/Analysis_Engine.png (PNG copy)>] [41]
+
 File: images/Neural_Network.png Graphic file (type png)
-Package pdftex.def Info: images/Neural_Network.png used on input line 664.
+Package pdftex.def Info: images/Neural_Network.png used on input line 694.
 (pdftex.def) Requested size: 483.6893pt x 341.42757pt.
-Overfull \hbox (42.84184pt too wide) in paragraph at lines 664--666
+Overfull \hbox (42.84184pt too wide) in paragraph at lines 694--696
 []
 []
 
-[40] [41 <./images/Neural_Network.png (PNG copy)>]
-
+[42 <./images/Neural_Network.png (PNG copy)>]
+
 File: images/Future_Predictions.png Graphic file (type png)
-Package pdftex.def Info: images/Future_Predictions.png used on input line 678.
+Package pdftex.def Info: images/Future_Predictions.png used on input line 708.
 (pdftex.def) Requested size: 512.1362pt x 227.62119pt.
-Overfull \hbox (71.28874pt too wide) in paragraph at lines 678--680
+Overfull \hbox (71.28874pt too wide) in paragraph at lines 708--710
 []
 []
 
-[42 <./images/Future_Predictions.png (PNG copy)>]
-
+[43 <./images/Future_Predictions.png (PNG copy)>]
+
 File: images/Frontend_Application.png Graphic file (type png)
-Package pdftex.def Info: images/Frontend_Application.png used on input line 69
+Package pdftex.def Info: images/Frontend_Application.png used on input line 72
 1.
 (pdftex.def) Requested size: 284.52162pt x 256.07664pt.
- [43 <./images/Frontend_Application.png (PNG copy)>]
-
+ [44 <./images/Frontend_Application.png (PNG copy)>]
+
 File: images/interface_design.png Graphic file (type png)
-Package pdftex.def Info: images/interface_design.png used on input line 710.
+Package pdftex.def Info: images/interface_design.png used on input line 740.
 (pdftex.def) Requested size: 227.61479pt x 369.88063pt.
- [44 <./images/interface_design.png>]
-Underfull \hbox (badness 10000) in paragraph at lines 721--723
+ [45 <./images/interface_design.png>]
+Underfull \hbox (badness 10000) in paragraph at lines 751--753
 
  []
 
 (/usr/share/texlive/texmf-dist/tex/latex/listings/lstlang1.sty
 File: lstlang1.sty 2015/06/04 1.6 listings language file
-) [45] [46]
-[47]
+) [46] [47]
+[48]
 LaTeX Font Info: Font shape `OMS/cmr/m/n' in size <10> not available
-(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 847.
- [48] [49]
-Underfull \hbox (badness 10000) in paragraph at lines 934--936
-
- []
-
-[50]
-Underfull \hbox (badness 10000) in paragraph at lines 971--973
+(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 877.
+ [49] [50]
+Underfull \hbox (badness 10000) in paragraph at lines 964--966
 
  []
 
 [51]
-LaTeX Font Info: Try loading font information for OML+cmr on input line 984.
+Underfull \hbox (badness 10000) in paragraph at lines 1001--1003
+
+ []
+
+[52]
+LaTeX Font Info: Try loading font information for OML+cmr on input line 1014
+.
 (/usr/share/texlive/texmf-dist/tex/latex/base/omlcmr.fd
 File: omlcmr.fd 2014/09/29 v2.5h Standard LaTeX font definitions
 )
 LaTeX Font Info: Font shape `OML/cmr/m/n' in size <10> not available
-(Font) Font shape `OML/cmm/m/it' tried instead on input line 984.
- [52] [53]
-[54]
-Underfull \hbox (badness 10000) in paragraph at lines 1075--1077
+(Font) Font shape `OML/cmm/m/it' tried instead on input line 1014.
+
+ [53] [54]
+[55]
+Underfull \hbox (badness 10000) in paragraph at lines 1105--1107
 
  []
 
 Missing character: There is no  in font cmr10!
 Missing character: There is no £ in font cmr10!
-Underfull \hbox (badness 10000) in paragraph at lines 1114--1117
+Underfull \hbox (badness 10000) in paragraph at lines 1144--1147
 
  []
 
-[55] [56] [57] [58] [59] [60] [61] [62]
-Overfull \hbox (5.27716pt too wide) in paragraph at lines 1297--1297
+[56] [57]
+Underfull \hbox (badness 10000) in paragraph at lines 1193--1195
+
+ []
+
+[58] [59] [60] [61] [62] [63] [64] [65] [66] [67] [68] [69]
+Overfull \hbox (5.27716pt too wide) in paragraph at lines 1509--1509
 \OT1/cmr/m/it/12 ence on sig-nal pro-cess-ing, com-mu-ni-ca-tion, power and em-
 bed-ded sys-tem (SCOPES)\OT1/cmr/m/n/12 ,
 []
 
-Overfull \hbox (42.7786pt too wide) in paragraph at lines 1297--1297
+Overfull \hbox (42.7786pt too wide) in paragraph at lines 1509--1509
 \OT1/cmr/m/n/12 works,'' To-wards Data Sci-ence, 2018. [On-line]. Avail-able: [
 ]$\OT1/cmtt/m/n/12 https : / / towardsdatascience .
 []
 
-[63]
-Overfull \hbox (86.07425pt too wide) in paragraph at lines 1297--1297
+[70]
+Overfull \hbox (86.07425pt too wide) in paragraph at lines 1509--1509
 \OT1/cmr/m/n/12 works,'' Ma-chine Larn-ing Mas-tery, 2017. [On-line]. Avail-abl
 e: []$\OT1/cmtt/m/n/12 https : / / machinelearningmastery .
 []
 
-Overfull \hbox (30.84552pt too wide) in paragraph at lines 1297--1297
+Overfull \hbox (30.84552pt too wide) in paragraph at lines 1509--1509
 \OT1/cmr/m/n/12 lem,'' Su-per Data Sci-ence, 2018. [On-line]. Avail-able: []$\O
 T1/cmtt/m/n/12 https : / / www . superdatascience .
 []
 
-[64]
-Overfull \hbox (9.16136pt too wide) in paragraph at lines 1297--1297
+[71]
+Overfull \hbox (9.16136pt too wide) in paragraph at lines 1509--1509
 \OT1/cmr/m/n/12 2019. [On-line]. Avail-able: []$\OT1/cmtt/m/n/12 https : / / me
 dium . com / datadriveninvestor / overview -[]
 []
 
-[65] [66]
+[72]
+Overfull \hbox (2.93918pt too wide) in paragraph at lines 1509--1509
+[]\OT1/cmr/m/n/12 P. Cryp-tog-ra-phy, ``A tu-to-rial on au-to-matic lan-guage i
+den-ti-fi-ca-tion - ngram based,''
+ []
+
+[73]
 
 pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
 rsion <1.7>, but at most version <1.5> allowed
-
+
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf used on input line 1302.
+Package pdftex.def Info: PID.pdf used on input line 1514.
 (pdftex.def) Requested size: 597.551pt x 845.07512pt.
@@ -1286,7 +1314,7 @@
 pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
 rsion <1.7>, but at most version <1.5> allowed
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf used on input line 1302.
+Package pdftex.def Info: PID.pdf used on input line 1514.
 (pdftex.def) Requested size: 597.551pt x 845.07512pt.
@@ -1296,237 +1324,237 @@
 rsion <1.7>, but at most version <1.5> allowed
 
 pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
 rsion <1.7>, but at most version <1.5> allowed
-
+
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf , page1 used on input line 1302.
+Package pdftex.def Info: PID.pdf , page1 used on input line 1514.
 (pdftex.def) Requested size: 597.551pt x 845.07512pt.
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf , page1 used on input line 1302.
+Package pdftex.def Info: PID.pdf , page1 used on input line 1514.
 (pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-[67]
+[74]
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf , page1 used on input line 1302.
+Package pdftex.def Info: PID.pdf , page1 used on input line 1514.
 (pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf , page1 used on input line 1302.
+Package pdftex.def Info: PID.pdf , page1 used on input line 1514.
 (pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf , page1 used on input line 1302.
+Package pdftex.def Info: PID.pdf , page1 used on input line 1514.
 (pdftex.def) Requested size: 562.1644pt x 795.0303pt.
- [68 <./PID.pdf>]
+ [75 <./PID.pdf>]
 
 pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
 rsion <1.7>, but at most version <1.5> allowed
-
+
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf , page2 used on input line 1302.
+Package pdftex.def Info: PID.pdf , page2 used on input line 1514.
 (pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf , page2 used on input line 1302.
+Package pdftex.def Info: PID.pdf , page2 used on input line 1514.
 (pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-Package pdftex.def Info: PID.pdf , page2 used on input line 1302.
+Package pdftex.def Info: PID.pdf , page2 used on input line 1514.
 (pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-[69 <./PID.pdf>]
-
-pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
-rsion <1.7>, but at most version <1.5> allowed
-
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page3 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page3 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page3 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-[70 <./PID.pdf>]
-
-pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
-rsion <1.7>, but at most version <1.5> allowed
-
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page4 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page4 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page4 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-[71 <./PID.pdf>]
-
-pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
-rsion <1.7>, but at most version <1.5> allowed
-
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page5 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page5 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page5 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-[72 <./PID.pdf>]
-
-pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
-rsion <1.7>, but at most version <1.5> allowed
-
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page6 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page6 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page6 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-[73 <./PID.pdf>]
-
-pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
-rsion <1.7>, but at most version <1.5> allowed
-
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page7 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page7 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page7 used on input line 1302.
-(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
-[74 <./PID.pdf>]
-
-pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
-rsion <1.7>, but at most version <1.5> allowed
-
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page8 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page8 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page8 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
-[75 <./PID.pdf>]
-
-pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
-rsion <1.7>, but at most version <1.5> allowed
-
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page9 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page9 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
-File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page9 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
 [76 <./PID.pdf>]
 
 pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
 rsion <1.7>, but at most version <1.5> allowed
-
+
 File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page10 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+
+Package pdftex.def Info: PID.pdf , page3 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page10 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+
+Package pdftex.def Info: PID.pdf , page3 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page10 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+
+Package pdftex.def Info: PID.pdf , page3 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 [77 <./PID.pdf>]
 
 pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
 rsion <1.7>, but at most version <1.5> allowed
-
+
 File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page11 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+
+Package pdftex.def Info: PID.pdf , page4 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page11 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+
+Package pdftex.def Info: PID.pdf , page4 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page11 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+
+Package pdftex.def Info: PID.pdf , page4 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 [78 <./PID.pdf>]
 
 pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
 rsion <1.7>, but at most version <1.5> allowed
-
+
 File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page12 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+
+Package pdftex.def Info: PID.pdf , page5 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page12 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+
+Package pdftex.def Info: PID.pdf , page5 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 File: PID.pdf Graphic file (type pdf)
-
-Package pdftex.def Info: PID.pdf , page12 used on input line 1302.
-(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+
+Package pdftex.def Info: PID.pdf , page5 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
 [79 <./PID.pdf>]
-Package atveryend Info: Empty hook `BeforeClearDocument' on input line 1306.
- [80]
-Package atveryend Info: Empty hook `AfterLastShipout' on input line 1306.
+
+pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
+rsion <1.7>, but at most version <1.5> allowed
+
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page6 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page6 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page6 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
+[80 <./PID.pdf>]
+
+pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
+rsion <1.7>, but at most version <1.5> allowed
+
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page7 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page7 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page7 used on input line 1514.
+(pdftex.def) Requested size: 562.1644pt x 795.0303pt.
+[81 <./PID.pdf>]
+
+pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
+rsion <1.7>, but at most version <1.5> allowed
+
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page8 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page8 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page8 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+[82 <./PID.pdf>]
+
+pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
+rsion <1.7>, but at most version <1.5> allowed
+
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page9 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page9 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page9 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+[83 <./PID.pdf>]
+
+pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
+rsion <1.7>, but at most version <1.5> allowed
+
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page10 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page10 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page10 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+[84 <./PID.pdf>]
+
+pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
+rsion <1.7>, but at most version <1.5> allowed
+
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page11 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page11 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page11 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+[85 <./PID.pdf>]
+
+pdfTeX warning: /usr/bin/pdflatex (file ./PID.pdf): PDF inclusion: found PDF ve
+rsion <1.7>, but at most version <1.5> allowed
+
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page12 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page12 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+File: PID.pdf Graphic file (type pdf)
+
+Package pdftex.def Info: PID.pdf , page12 used on input line 1514.
+(pdftex.def) Requested size: 795.0303pt x 562.1644pt.
+[86 <./PID.pdf>]
+Package atveryend Info: Empty hook `BeforeClearDocument' on input line 1518.
+ [87]
+Package atveryend Info: Empty hook `AfterLastShipout' on input line 1518.
 (./document.aux)
-Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 1306.
-Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 1306.
+Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 1518.
+Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 1518.
 Package rerunfilecheck Info: File `document.out' has not changed.
-(rerunfilecheck) Checksum: 283D22108CCCB057050FE149F93F8AC8;10716.
+(rerunfilecheck) Checksum: DF36EDBB0B7E9D72F7D48A764834A921;10956.
 Package logreq Info: Writing requests to 'document.run.xml'.
 \openout1 = `document.run.xml'.
 )
 Here is how much of TeX's memory you used:
- 24631 strings out of 492982
- 386648 string characters out of 6134895
- 1047130 words of memory out of 5000000
- 27213 multiletter control sequences out of 15000+600000
+ 24831 strings out of 492982
+ 389633 string characters out of 6134895
+ 1055130 words of memory out of 5000000
+ 27264 multiletter control sequences out of 15000+600000
 13923 words of font info for 54 fonts, out of 8000000 for 9000
 1141 hyphenation exceptions out of 8191
- 45i,18n,78p,2008b,1819s stack positions out of 5000i,500n,10000p,200000b,80000s
+ 45i,18n,78p,2008b,1817s stack positions out of 5000i,500n,10000p,200000b,80000s
-Output written on document.pdf (80 pages, 1495523 bytes).
+Output written on document.pdf (87 pages, 1528100 bytes).
 PDF statistics:
- 1611 PDF objects out of 1728 (max. 8388607)
- 1460 compressed objects within 15 object streams
- 633 named destinations out of 1000 (max. 500000)
- 688 words of extra memory for PDF output out of 10000 (max. 10000000)
+ 1845 PDF objects out of 2073 (max. 8388607)
+ 1685 compressed objects within 17 object streams
+ 782 named destinations out of 1000 (max. 500000)
+ 704 words of extra memory for PDF output out of 10000 (max. 10000000)
diff --git a/document.out b/document.out
index bba1423..644ffa2 100644
--- a/document.out
+++ b/document.out
@@ -25,45 +25,47 @@
 \BOOKMARK [3][-]{section*.26}{\376\377\000O\000p\000t\000i\000m\000i\000s\000e\000r\000s}{section*.22}% 25
 \BOOKMARK [2][-]{section*.27}{\376\377\000M\000a\000c\000h\000i\000n\000e\000\040\000L\000e\000a\000r\000n\000i\000n\000g}{section*.13}% 26
 \BOOKMARK [3][-]{section*.28}{\376\377\000N\000a\000i\000v\000e\000\040\000B\000a\000y\000e\000s}{section*.27}% 27
-\BOOKMARK [2][-]{section*.29}{\376\377\000R\000a\000n\000d\000o\000m\000\040\000F\000o\000r\000e\000s\000t}{section*.13}% 28
-\BOOKMARK [1][-]{section*.30}{\376\377\000S\000o\000l\000u\000t\000i\000o\000n\000\040\000A\000p\000p\000r\000o\000a\000c\000h}{}% 29
-\BOOKMARK [2][-]{section*.31}{\376\377\000D\000a\000t\000a\000\040\000g\000a\000t\000h\000e\000r\000i\000n\000g}{section*.30}% 30
-\BOOKMARK [2][-]{section*.32}{\376\377\000D\000a\000t\000a\000\040\000p\000r\000e\000-\000p\000r\000o\000c\000e\000s\000s\000i\000n\000g}{section*.30}% 31
-\BOOKMARK [2][-]{section*.33}{\376\377\000S\000p\000a\000m\000\040\000F\000i\000l\000t\000e\000r\000i\000n\000g}{section*.30}% 32
-\BOOKMARK [2][-]{section*.34}{\376\377\000L\000a\000n\000g\000u\000a\000g\000e\000\040\000D\000e\000t\000e\000c\000t\000i\000o\000n}{section*.30}% 33
-\BOOKMARK [2][-]{section*.35}{\376\377\000S\000e\000n\000t\000i\000m\000e\000n\000t\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{section*.30}% 34
-\BOOKMARK [2][-]{section*.36}{\376\377\000N\000e\000u\000r\000a\000l\000\040\000N\000e\000t\000w\000o\000r\000k}{section*.30}% 35
-\BOOKMARK [2][-]{section*.38}{\376\377\000P\000r\000i\000c\000e\000\040\000F\000o\000r\000e\000c\000a\000s\000t\000i\000n\000g}{section*.30}% 36
-\BOOKMARK [2][-]{section*.39}{\376\377\000F\000r\000o\000n\000t\000e\000n\000d\000\040\000A\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n}{section*.30}% 37
-\BOOKMARK [2][-]{section*.40}{\376\377\000W\000i\000t\000h\000\040\000r\000e\000f\000e\000r\000e\000n\000c\000e\000\040\000t\000o\000\040\000I\000n\000i\000t\000i\000a\000l\000\040\000P\000I\000D}{section*.30}% 38
-\BOOKMARK [2][-]{section*.41}{\376\377\000S\000o\000l\000u\000t\000i\000o\000n\000\040\000S\000u\000m\000m\000a\000r\000y}{section*.30}% 39
-\BOOKMARK [2][-]{section*.42}{\376\377\000D\000a\000t\000a\000\040\000f\000l\000o\000w\000\040\000O\000v\000e\000r\000v\000i\000e\000w}{section*.30}% 40
-\BOOKMARK [1][-]{section*.43}{\376\377\000S\000y\000s\000t\000e\000m\000\040\000D\000e\000s\000i\000g\000n}{}% 41
-\BOOKMARK [2][-]{section*.44}{\376\377\000D\000a\000t\000a\000f\000l\000o\000w\000\040\000D\000e\000s\000i\000g\000n\000s}{section*.43}% 42
-\BOOKMARK [2][-]{section*.45}{\376\377\000I\000n\000t\000e\000r\000f\000a\000c\000e\000\040\000D\000e\000s\000i\000g\000n}{section*.43}% 43
-\BOOKMARK [1][-]{section*.47}{\376\377\000I\000m\000p\000l\000e\000m\000e\000n\000t\000a\000t\000i\000o\000n}{}% 44
-\BOOKMARK [2][-]{section*.48}{\376\377\000D\000a\000t\000a\000\040\000c\000o\000l\000l\000e\000c\000t\000i\000o\000n}{section*.47}% 45
-\BOOKMARK [3][-]{section*.49}{\376\377\000P\000r\000i\000c\000e\000\040\000T\000i\000m\000e\000-\000S\000e\000r\000i\000e\000s\000\040\000H\000i\000s\000t\000o\000r\000i\000c\000a\000l\000\040\000D\000a\000t\000a}{section*.48}% 46
-\BOOKMARK [3][-]{section*.50}{\376\377\000P\000r\000i\000c\000e\000\040\000T\000i\000m\000e\000-\000S\000e\000r\000i\000e\000s\000\040\000L\000i\000v\000e\000\040\000D\000a\000t\000a}{section*.48}% 47
-\BOOKMARK
[3][-]{section*.51}{\376\377\000H\000i\000s\000t\000o\000r\000i\000c\000a\000l\000\040\000T\000w\000e\000e\000t\000\040\000C\000o\000l\000l\000e\000c\000t\000i\000o\000n}{section*.48}% 48 -\BOOKMARK [3][-]{section*.52}{\376\377\000L\000i\000v\000e\000\040\000T\000w\000e\000e\000t\000\040\000C\000o\000l\000l\000e\000c\000t\000i\000o\000n}{section*.48}% 49 -\BOOKMARK [2][-]{section*.53}{\376\377\000D\000a\000t\000a\000\040\000p\000r\000e\000-\000p\000r\000o\000c\000e\000s\000s\000i\000n\000g}{section*.47}% 50 -\BOOKMARK [3][-]{section*.54}{\376\377\000S\000p\000a\000m\000\040\000F\000i\000l\000t\000e\000r\000i\000n\000g}{section*.53}% 51 -\BOOKMARK [2][-]{section*.55}{\376\377\000S\000e\000n\000t\000i\000m\000e\000n\000t\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{section*.47}% 52 -\BOOKMARK [3][-]{section*.56}{\376\377\000V\000A\000D\000E\000R}{section*.55}% 53 -\BOOKMARK [2][-]{section*.57}{\376\377\000R\000e\000c\000u\000r\000r\000e\000n\000t\000\040\000N\000e\000u\000r\000a\000l\000\040\000N\000e\000t\000w\000o\000r\000k\000\040\000-\000\040\000L\000S\000T\000M}{section*.47}% 54 -\BOOKMARK [3][-]{section*.58}{\376\377\000T\000r\000a\000i\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000T\000e\000s\000t\000i\000n\000g\000\040\000M\000o\000d\000e\000l}{section*.57}% 55 -\BOOKMARK [3][-]{section*.59}{\376\377\000S\000c\000o\000r\000i\000n\000g\000\040\000a\000n\000d\000\040\000V\000a\000l\000i\000d\000a\000t\000i\000o\000n}{section*.57}% 56 -\BOOKMARK [2][-]{section*.60}{\376\377\000F\000u\000t\000u\000r\000e\000\040\000P\000r\000e\000d\000i\000c\000t\000i\000o\000n\000\040\000F\000o\000r\000e\000c\000a\000s\000t\000i\000n\000g}{section*.47}% 57 -\BOOKMARK [2][-]{section*.61}{\376\377\000M\000a\000i\000n\000\040\000F\000i\000l\000e\000\040\000'\000M\000a\000i\000n\000.\000p\000y\000'}{section*.47}% 58 -\BOOKMARK [2][-]{section*.62}{\376\377\000M\000i\000s\000c\000e\000l\000l\000a\000n\000e\000o\000u\000s}{section*.47}% 59 -\BOOKMARK [1][-]{section*.63}{\376\377\000T\000e\000s\000t\000i\000n\000g\000\040\000M\000e\000t\000r\000i\000c\000s\000\040\000a\000n\000d\000\040\000A\000c\000c\000u\000r\000a\000c\000y}{}% 60 -\BOOKMARK [1][-]{section*.64}{\376\377\000P\000r\000o\000j\000e\000c\000t\000\040\000E\000v\000a\000l\000u\000a\000t\000i\000o\000n}{}% 61 -\BOOKMARK [1][-]{section*.65}{\376\377\000D\000i\000s\000c\000u\000s\000s\000i\000o\000n\000:\000\040\000C\000o\000n\000t\000r\000i\000b\000u\000t\000i\000o\000n\000\040\000a\000n\000d\000\040\000R\000e\000f\000l\000e\000c\000t\000i\000o\000n}{}% 62 -\BOOKMARK [2][-]{section*.66}{\376\377\000L\000i\000m\000i\000t\000a\000t\000i\000o\000n\000s}{section*.65}% 63 -\BOOKMARK [1][-]{section*.67}{\376\377\000C\000o\000n\000c\000l\000u\000s\000i\000o\000n\000\040\000a\000n\000d\000\040\000F\000u\000t\000u\000r\000e\000\040\000I\000m\000p\000r\000o\000v\000e\000m\000e\000n\000t\000s}{}% 64 -\BOOKMARK [2][-]{section*.68}{\376\377\000C\000o\000n\000c\000l\000u\000s\000i\000o\000n}{section*.67}% 65 -\BOOKMARK [2][-]{section*.69}{\376\377\000F\000u\000t\000u\000r\000e\000\040\000I\000m\000p\000r\000o\000v\000e\000m\000e\000n\000t\000s}{section*.67}% 66 -\BOOKMARK [1][-]{section*.71}{\376\377\000A\000p\000p\000e\000n\000d\000i\000c\000e\000s}{}% 67 -\BOOKMARK [2][-]{section*.72}{\376\377\000A\000p\000p\000e\000n\000d\000i\000x\000\040\000A\000\040\000-\000\040\000P\000r\000o\000j\000e\000c\000t\000\040\000I\000n\000i\000t\000i\000a\000t\000i\000o\000n\000\040\000D\000o\000c\000u\000m\000e\000n\000t}{section*.71}% 68 -\BOOKMARK 
[2][-]{section*.73}{\376\377\000A\000p\000p\000e\000n\000d\000i\000x\000\040\000B\000\040\000-\000\040\000L\000o\000g\000\040\000b\000o\000o\000k}{section*.71}% 69 +\BOOKMARK [2][-]{section*.29}{\376\377\000B\000a\000g\000\040\000O\000f\000\040\000W\000o\000r\000d\000s}{section*.13}% 28 +\BOOKMARK [2][-]{section*.30}{\376\377\000T\000F\000-\000I\000D\000F}{section*.13}% 29 +\BOOKMARK [2][-]{section*.31}{\376\377\000A\000d\000d\000i\000c\000t\000i\000v\000e\000\040\000S\000m\000o\000o\000t\000h\000i\000n\000g}{section*.13}% 30 +\BOOKMARK [1][-]{section*.32}{\376\377\000S\000o\000l\000u\000t\000i\000o\000n\000\040\000A\000p\000p\000r\000o\000a\000c\000h}{}% 31 +\BOOKMARK [2][-]{section*.33}{\376\377\000D\000a\000t\000a\000\040\000g\000a\000t\000h\000e\000r\000i\000n\000g}{section*.32}% 32 +\BOOKMARK [2][-]{section*.34}{\376\377\000D\000a\000t\000a\000\040\000p\000r\000e\000-\000p\000r\000o\000c\000e\000s\000s\000i\000n\000g}{section*.32}% 33 +\BOOKMARK [2][-]{section*.35}{\376\377\000S\000p\000a\000m\000\040\000F\000i\000l\000t\000e\000r\000i\000n\000g}{section*.32}% 34 +\BOOKMARK [2][-]{section*.36}{\376\377\000L\000a\000n\000g\000u\000a\000g\000e\000\040\000D\000e\000t\000e\000c\000t\000i\000o\000n}{section*.32}% 35 +\BOOKMARK [2][-]{section*.37}{\376\377\000S\000e\000n\000t\000i\000m\000e\000n\000t\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{section*.32}% 36 +\BOOKMARK [2][-]{section*.38}{\376\377\000N\000e\000u\000r\000a\000l\000\040\000N\000e\000t\000w\000o\000r\000k}{section*.32}% 37 +\BOOKMARK [2][-]{section*.40}{\376\377\000P\000r\000i\000c\000e\000\040\000F\000o\000r\000e\000c\000a\000s\000t\000i\000n\000g}{section*.32}% 38 +\BOOKMARK [2][-]{section*.41}{\376\377\000F\000r\000o\000n\000t\000e\000n\000d\000\040\000A\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n}{section*.32}% 39 +\BOOKMARK [2][-]{section*.42}{\376\377\000W\000i\000t\000h\000\040\000r\000e\000f\000e\000r\000e\000n\000c\000e\000\040\000t\000o\000\040\000I\000n\000i\000t\000i\000a\000l\000\040\000P\000I\000D}{section*.32}% 40 +\BOOKMARK [2][-]{section*.43}{\376\377\000S\000o\000l\000u\000t\000i\000o\000n\000\040\000S\000u\000m\000m\000a\000r\000y}{section*.32}% 41 +\BOOKMARK [2][-]{section*.44}{\376\377\000D\000a\000t\000a\000\040\000f\000l\000o\000w\000\040\000O\000v\000e\000r\000v\000i\000e\000w}{section*.32}% 42 +\BOOKMARK [1][-]{section*.45}{\376\377\000S\000y\000s\000t\000e\000m\000\040\000D\000e\000s\000i\000g\000n}{}% 43 +\BOOKMARK [2][-]{section*.46}{\376\377\000D\000a\000t\000a\000f\000l\000o\000w\000\040\000D\000e\000s\000i\000g\000n\000s}{section*.45}% 44 +\BOOKMARK [2][-]{section*.47}{\376\377\000I\000n\000t\000e\000r\000f\000a\000c\000e\000\040\000D\000e\000s\000i\000g\000n}{section*.45}% 45 +\BOOKMARK [1][-]{section*.49}{\376\377\000I\000m\000p\000l\000e\000m\000e\000n\000t\000a\000t\000i\000o\000n}{}% 46 +\BOOKMARK [2][-]{section*.50}{\376\377\000D\000a\000t\000a\000\040\000c\000o\000l\000l\000e\000c\000t\000i\000o\000n}{section*.49}% 47 +\BOOKMARK [3][-]{section*.51}{\376\377\000P\000r\000i\000c\000e\000\040\000T\000i\000m\000e\000-\000S\000e\000r\000i\000e\000s\000\040\000H\000i\000s\000t\000o\000r\000i\000c\000a\000l\000\040\000D\000a\000t\000a}{section*.50}% 48 +\BOOKMARK [3][-]{section*.52}{\376\377\000P\000r\000i\000c\000e\000\040\000T\000i\000m\000e\000-\000S\000e\000r\000i\000e\000s\000\040\000L\000i\000v\000e\000\040\000D\000a\000t\000a}{section*.50}% 49 +\BOOKMARK 
[3][-]{section*.53}{\376\377\000H\000i\000s\000t\000o\000r\000i\000c\000a\000l\000\040\000T\000w\000e\000e\000t\000\040\000C\000o\000l\000l\000e\000c\000t\000i\000o\000n}{section*.50}% 50 +\BOOKMARK [3][-]{section*.54}{\376\377\000L\000i\000v\000e\000\040\000T\000w\000e\000e\000t\000\040\000C\000o\000l\000l\000e\000c\000t\000i\000o\000n}{section*.50}% 51 +\BOOKMARK [2][-]{section*.55}{\376\377\000D\000a\000t\000a\000\040\000p\000r\000e\000-\000p\000r\000o\000c\000e\000s\000s\000i\000n\000g}{section*.49}% 52 +\BOOKMARK [2][-]{section*.56}{\376\377\000S\000p\000a\000m\000\040\000F\000i\000l\000t\000e\000r\000i\000n\000g}{section*.49}% 53 +\BOOKMARK [2][-]{section*.57}{\376\377\000S\000e\000n\000t\000i\000m\000e\000n\000t\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{section*.49}% 54 +\BOOKMARK [3][-]{section*.58}{\376\377\000V\000A\000D\000E\000R}{section*.57}% 55 +\BOOKMARK [2][-]{section*.59}{\376\377\000R\000e\000c\000u\000r\000r\000e\000n\000t\000\040\000N\000e\000u\000r\000a\000l\000\040\000N\000e\000t\000w\000o\000r\000k\000\040\000-\000\040\000L\000S\000T\000M}{section*.49}% 56 +\BOOKMARK [3][-]{section*.60}{\376\377\000T\000r\000a\000i\000n\000i\000n\000g\000\040\000a\000n\000d\000\040\000T\000e\000s\000t\000i\000n\000g\000\040\000M\000o\000d\000e\000l}{section*.59}% 57 +\BOOKMARK [3][-]{section*.61}{\376\377\000S\000c\000o\000r\000i\000n\000g\000\040\000a\000n\000d\000\040\000V\000a\000l\000i\000d\000a\000t\000i\000o\000n}{section*.59}% 58 +\BOOKMARK [2][-]{section*.62}{\376\377\000F\000u\000t\000u\000r\000e\000\040\000P\000r\000e\000d\000i\000c\000t\000i\000o\000n\000\040\000F\000o\000r\000e\000c\000a\000s\000t\000i\000n\000g}{section*.49}% 59 +\BOOKMARK [2][-]{section*.63}{\376\377\000M\000a\000i\000n\000\040\000F\000i\000l\000e\000\040\000'\000M\000a\000i\000n\000.\000p\000y\000'}{section*.49}% 60 +\BOOKMARK [2][-]{section*.64}{\376\377\000M\000i\000s\000c\000e\000l\000l\000a\000n\000e\000o\000u\000s}{section*.49}% 61 +\BOOKMARK [1][-]{section*.65}{\376\377\000T\000e\000s\000t\000i\000n\000g\000\040\000M\000e\000t\000r\000i\000c\000s\000\040\000a\000n\000d\000\040\000A\000c\000c\000u\000r\000a\000c\000y}{}% 62 +\BOOKMARK [1][-]{section*.66}{\376\377\000P\000r\000o\000j\000e\000c\000t\000\040\000E\000v\000a\000l\000u\000a\000t\000i\000o\000n}{}% 63 +\BOOKMARK [1][-]{section*.67}{\376\377\000D\000i\000s\000c\000u\000s\000s\000i\000o\000n\000:\000\040\000C\000o\000n\000t\000r\000i\000b\000u\000t\000i\000o\000n\000\040\000a\000n\000d\000\040\000R\000e\000f\000l\000e\000c\000t\000i\000o\000n}{}% 64 +\BOOKMARK [2][-]{section*.68}{\376\377\000L\000i\000m\000i\000t\000a\000t\000i\000o\000n\000s}{section*.67}% 65 +\BOOKMARK [1][-]{section*.69}{\376\377\000C\000o\000n\000c\000l\000u\000s\000i\000o\000n\000\040\000a\000n\000d\000\040\000F\000u\000t\000u\000r\000e\000\040\000I\000m\000p\000r\000o\000v\000e\000m\000e\000n\000t\000s}{}% 66 +\BOOKMARK [2][-]{section*.70}{\376\377\000C\000o\000n\000c\000l\000u\000s\000i\000o\000n}{section*.69}% 67 +\BOOKMARK [2][-]{section*.71}{\376\377\000F\000u\000t\000u\000r\000e\000\040\000I\000m\000p\000r\000o\000v\000e\000m\000e\000n\000t\000s}{section*.69}% 68 +\BOOKMARK [1][-]{section*.73}{\376\377\000A\000p\000p\000e\000n\000d\000i\000c\000e\000s}{}% 69 +\BOOKMARK [2][-]{section*.74}{\376\377\000A\000p\000p\000e\000n\000d\000i\000x\000\040\000A\000\040\000-\000\040\000P\000r\000o\000j\000e\000c\000t\000\040\000I\000n\000i\000t\000i\000a\000t\000i\000o\000n\000\040\000D\000o\000c\000u\000m\000e\000n\000t}{section*.73}% 70 +\BOOKMARK 
[2][-]{section*.75}{\376\377\000A\000p\000p\000e\000n\000d\000i\000x\000\040\000B\000\040\000-\000\040\000L\000o\000g\000\040\000b\000o\000o\000k}{section*.73}% 71
diff --git a/document.pdf b/document.pdf
index 18b0a75..c82946b 100644
Binary files a/document.pdf and b/document.pdf differ
diff --git a/document.synctex.gz b/document.synctex.gz
index c69833a..a9dd93f 100644
Binary files a/document.synctex.gz and b/document.synctex.gz differ
diff --git a/document.tex b/document.tex
index 4e581f5..4277f57 100644
--- a/document.tex
+++ b/document.tex
@@ -478,7 +478,35 @@
 The naive Bayes approach has many applications, especially for the topic of this project in classifying the probability occurrence of the next price. Although it is a robust algorithm, it has drawbacks that make it less suitable than a neural network for the needs of this project. The naive Bayes trap is an issue that may occur due to the size of the dataset that will be used. There are, however, other scenarios in which this algorithm could be used, such as the classification of spam data.\cite{32}

- \subsection{Random Forest}
+ \subsection{Bag Of Words}
+ The Bag Of Words algorithm counts the occurrence ('term frequency') of each word in a given text or document. These counts allow texts to be compared for classification, and the technique is used prior to TF-IDF (detailed below) to help identify the probability of each word in a given text and classify the text accordingly. \cite{33}
+
+ \[P(w) \ and\ P(w|spam) = \frac{Total\ number\ of\ occurrences\ of\ w\ in\ dataset}{Total\ number\ of\ words\ in\ dataset}\]
+
+ \subsection{TF-IDF}
+ Term Frequency-Inverse Document Frequency is another technique, similar to Bag of Words, used to judge the topic of a given text. Each word is given a weight (relevance rather than raw frequency) based on how many times it occurs in the given text \cite{33}.
+ Term frequency measures the number of times a word appears in the text; but because words such as 'and', 'the' and 'a' appear frequently in any text, Inverse Document Frequency is used to reduce the weight of the words that appear the most. Words that appear the most are thereby signalled to be less important and valuable, and so will not dominate classification when used with models such as Naive Bayes. \cite{33}
+ \newline
+
+ IDF is defined as:
+
+ \[IDF(w) = log\frac{Total\ number\ of\ messages}{Total\ number\ of\ messages\ containing\ w}\]
+
+ TF-IDF is thus defined for both probabilities as:
+
+ \[P(w) = \frac{TF(w)*IDF(w)}{\sum _{all\ words\ x\ \in\ train\ dataset} TF(x)*IDF(x)}\]
+
+ \[P(w|spam) = \frac{TF(w|spam)*IDF(w)}{\sum _{all\ words\ x\ \in\ train\ dataset} TF(x|spam)*IDF(x)}\]
+
+ \cite{34}
+
+ \subsection{Additive Smoothing}
+ Used alongside Bag Of Words, additive smoothing is a method of handling words that appear in the test data but not in the training dataset. For such a word, $P(w)$ would evaluate to 0, making $P(w|spam)$ undefined, as the word could not be classified. Additive smoothing tackles this by adding a number $\alpha$ to the numerator, and adding $\alpha$ times the number of distinct words to the denominator. \cite{34}
+ \newline
+
+ For TF-IDF, with $N$ the number of distinct words in the training dataset:
+
+ \[P(w|spam) = \frac{TF(w|spam)*IDF(w) + \alpha}{\sum _{all\ words\ x\ \in\ train\ dataset} TF(x|spam)*IDF(x) + \alpha N}\]
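+
+ To make the definitions above concrete, the following minimal sketch (illustrative only, not this project's implementation; the toy dataset and names are invented) computes additively smoothed Bag Of Words probabilities:
+
+ \begin{lstlisting}[language=python, caption=Illustrative Bag Of Words counts with additive smoothing]
+from collections import Counter
+
+train = ["free btc win win", "bitcoin closed with gains"]
+labels = [1, 0]  # 1 = spam, 0 = ham
+alpha = 1  # additive smoothing constant
+
+# Bag Of Words: term frequencies over the spam class only
+spamCounts = Counter()
+for text, label in zip(train, labels):
+    if label == 1:
+        spamCounts.update(text.split())
+
+totalSpam = sum(spamCounts.values())
+vocab = {w for text in train for w in text.split()}
+
+def pWordGivenSpam(word):
+    # unseen words receive a small non-zero probability instead of 0
+    return (spamCounts.get(word, 0) + alpha) / (totalSpam + alpha * len(vocab))
+
+print(pWordGivenSpam("win"))    # seen in spam: (2+1)/(4+7)
+print(pWordGivenSpam("gains"))  # unseen in spam: (0+1)/(4+7)
+\end{lstlisting}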
 \newpage
@@ -503,7 +531,7 @@
 \textbf{Tweets}
 \newline
 \newline
- Historical tweets can be obtained through the Twitter API, and however is not a feature of the Tweepy package - \textit{not mentioned or method on official Tweepy Documentation} \cite{33}. The Twitter API, as explained in the Literature review, allows for historical tweets to be extracted from the platform, 100 per request and a maximum of 50 requests per month. This proposes an issue with not providing enough data, where the sentiment will need to be calculated per hour. Simply put, for a year of hourly price data, there will be 9050 records. Therefore the equivalent will be required for sentiment; however, the sentiment will be the average the sentiment per hour of tweets. Using a single request with 100 tweets per hour, per hour; 905,000 tweets will need to be extracted to provide the data required. A solution to this issue could be to use and create multiple accounts and manually extract data from the API and merge. Another option is the pay for the data from 3rd party companies who have access to the Enterprise API and can pull more data, 2000 per request \\cite{7}\cite{8}. Due to the price for data of these 3rd parties the former could be a suitable, but more time-consuming option.
+ Historical tweets can be obtained through the Twitter API; however, this is not a feature of the Tweepy package - \textit{no such method is mentioned in the official Tweepy documentation} \cite{35}. The Twitter API, as explained in the Literature review, allows historical tweets to be extracted from the platform, 100 per request and a maximum of 50 requests per month. This poses the problem of not providing enough data, given that sentiment will need to be calculated per hour. Simply put, for a year of hourly price data there will be 9050 records, and the equivalent will be required for sentiment, where each hour's sentiment is the average over that hour's tweets. At a single request of 100 tweets per hour, 905,000 tweets would need to be extracted to provide the data required. One solution could be to create multiple accounts, manually extract data from the API through each, and merge the results. Another option is to pay for the data from 3rd-party companies who have access to the Enterprise API and can pull more data, 2,000 tweets per request \cite{7}\cite{8}. Given the price these 3rd parties charge for data, the former could be a suitable, though more time-consuming, option.

 Live tweets can be collected by two methods: from the Twitter API directly, or using a Twitter Python package such as Tweepy, as detailed in the Literature review. The limitations of the Twitter API are also discussed in the Literature review, namely its tiering system: Standard, Premium and Enterprise. Each tier has a different level of access to the API and can extract varying amounts of data from the platform. As concluded in the Literature review, the Twitter API will not be used directly for the extraction and streaming of live tweets, since that is restricted to Enterprise users. Instead, Tweepy will be used to set up a looping authenticated streaming solution against the Twitter API, which allows continuous access to incoming data.
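+
+ As a brief illustration of this approach - a minimal outline assuming Tweepy v3.x, with placeholder credentials and a placeholder handler, rather than the project's actual collector - such a stream takes only a few lines:
+
+ \begin{lstlisting}[language=python, caption=Minimal Tweepy streaming outline (illustrative)]
+import tweepy
+
+class TweetListener(tweepy.StreamListener):
+    def on_status(self, status):
+        # hand each incoming tweet to the pre-processing pipeline
+        print(status.text)
+
+    def on_error(self, status_code):
+        # returning False on rate limiting (HTTP 420) disconnects the stream
+        return status_code != 420
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)  # placeholder keys
+auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
+
+stream = tweepy.Stream(auth=auth, listener=TweetListener())
+stream.filter(track=['bitcoin'], languages=['en'])  # restrict to English tweets
+\end{lstlisting}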
 Natural language pre-processing will be a part of most systems in this project. Techniques such as tokenisation, stemming, stopword removal and character filtering will be prevalent, as these will be used to remove unwanted data and to sanitise the data for classification.
@@ -571,7 +599,7 @@
 \multirow{3}{*}{Pytorch} & Graph definition is more imperative and dynamic than other frameworks & Not as widely adopted as TensorFlow \\
 & Graph computation defined at runtime, allowing standard popular IDEs to support it & Visualisation is not as robust as TensorBoard \\
 & Natively supports common Python deployment frameworks such as Flask & Not as deployable as TensorFlow, doesn't support gRPC \\
 & & \\
 \end{tabular}}
- \textbf{Comparison between TensorFlow, Theano and Pytorch}\cite{34}
+ \textbf{Comparison between TensorFlow, Theano and Pytorch}\cite{36}
 \end{table}

 Due to the continued support and development of TensorFlow, its broad community, and the availability of a high-level wrapper - Keras - this library will be used for this project. Although Pytorch is a good alternative, it is not as easy to use or implement as TensorFlow with Keras.
@@ -659,6 +687,8 @@
 \item Storage - The polarity classification and tweets are then saved to their relevant CSV files for historical and live data.
 \end{itemize}

+ \newpage
+
 \textbf{Neural Network}
 \begin{center}
 \includegraphics[width=17cm,height=12cm]{images/Neural_Network.png}
 \end{center}
@@ -723,7 +753,7 @@
 \subsection{Data collection}\label{collection}

 \subsubsection{Price Time-Series Historical Data}
- Historical price data were extracted from a CSV historical price tracker, \textit{Bitcoin Charts} \cite{35}. This tracker provided the historical data from the three exchanges used for Live price collection - Coinbase, Bitfinex and Gemini, since the exchanges supported the cryptocurrency. The data used spans from \textit{2018-01-06} to \textit{2019-01-06}.
+ Historical price data were extracted from a CSV historical price tracker, \textit{Bitcoin Charts} \cite{37}. This tracker provided the historical data from the three exchanges used for live price collection - Coinbase, Bitfinex and Gemini - covering the period in which each exchange has supported the cryptocurrency. The data used spans from \textit{2018-01-06} to \textit{2019-01-06}.

 \begin{lstlisting}[language=Python, caption=Historical price collection and averaging per exchange]
 ...
@@ -1116,9 +1146,10 @@ class utilityFuncs():
 \newline
 \textbf{Language detection filtering}
+ This feature of the system is used as an additional filter for removing non-English tweets. As discussed in the solution approach, the Tweepy/Twitter API provides a means to filter out non-English tweets; this, however, does not work if the user merely has English set as their preferred language and 'en' as their region on Twitter. Because of this, non-English characters can still appear in collected tweets; these are therefore detected and filtered with the function below.

- \begin{lstlisting}[language=python, caption=Language detection and filter function \cite{36}]
+ \begin{lstlisting}[language=python, caption=Language detection and filter function \cite{38}]
 def detectLaguage(self, text):
     """ Calculate the probability of given text is written in several languages
@@ -1157,14 +1188,69 @@ def detectLaguage(self, text):
     # If text is not predominately English drop tweet
 \end{lstlisting}
- This function uses several natural languages pre-processing techniques to identify the most predominant language for a given text. It accomplishes this by first tokenising the text into tokens and converting them to lower case - this is so that the stopwords can be identified. For each of the languages supported by the Natural Language Toolkit Python package, the stopwords are identified in the text and compared to the stopwords in the language corpus' in NLTK. The ratios for the individual languages are formed, and then the predominant language identified. If the language is not predominantly English, the tweet is dropped.
+ This function uses several natural language pre-processing techniques to identify the predominant language of a given text. It accomplishes this by first tokenising the text and converting the tokens to lower case - this is so that the stopwords can be identified. For each of the languages supported by the Natural Language Toolkit Python package, the stopwords found in the text are compared to the stopwords in the corresponding NLTK language corpora. Ratios are formed for the individual languages, and the predominant language is then identified. If the language is not predominantly English, the tweet is dropped. There is, however, an issue with this approach: if a tweet contains too many special characters (characters that are otherwise allowed), it is occasionally not classified as English even when, upon visual inspection, it predominantly is; such a tweet is dropped and not processed. This is not a significant issue, as about 3,000 tweets can be collected in an hour, and some of these would be filtered out by the spam filter regardless.

- Ngrams could also be used
+ Additionally, an n-grams method could be used to distinguish the language of a given text, and may perform more accurately than the word-based approach that was implemented \cite{39}. Since the word-based approach is sufficient, and an n-gram profile would need to be built for each language to compare against, this is left as a later improvement and a possible point of comparison between the two approaches.
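+
+ As a sketch of how such an n-gram approach might look (purely illustrative - the profile corpora and names here are hypothetical, and this is not part of the implemented system), languages can be ranked by how strongly a text's character trigrams overlap with per-language trigram profiles:
+
+ \begin{lstlisting}[language=python, caption=Character n-gram language scoring (illustrative sketch)]
+from collections import Counter
+
+def charNgrams(text, n=3):
+    text = text.lower()
+    return Counter(text[i:i + n] for i in range(len(text) - n + 1))
+
+def score(text, profile):
+    # overlap between the text's trigram counts and a per-language profile
+    grams = charNgrams(text)
+    return sum(count for gram, count in grams.items() if gram in profile)
+
+# real profiles would be built from a large corpus per language
+profiles = {'english': charNgrams("the quick brown fox jumps over the lazy dog"),
+            'spanish': charNgrams("el rapido zorro marron salta sobre el perro perezoso")}
+
+tweet = "the price of bitcoin is over the moon"
+best = max(profiles, key=lambda lang: score(tweet, profiles[lang]))
+print(best)
+\end{lstlisting}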
+ \newline

 \textbf{Spam filter - Tokenisation, Ngrams, Stopword removal and Stemming}

- \subsubsection{Spam Filtering}
- \begin{lstlisting}[language=python, caption=Spam filter training Class]
+ Before any text is processed, either to train the Naive Bayes classifier of the spam filter or to classify live tweets, the data needs to be pre-processed to extract the features from the text, so that the classifier can identify the probability of each word in the given text. How this classifier functions is detailed in the 'Spam Filtering' section.
+
+ \begin{lstlisting}[language=python, caption=Pre-processing of data prior to being used by the spam filter]
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer
+...
+
+def processTweet(tweet, gram = 2):
+    tweet = tweet.lower()  # convert to lower case
+
+    words = word_tokenize(tweet)  # Tokenise words in text
+    words = [w for w in words if len(w) > 2]
+    # remove words that are not greater than 2 characters
+
+    if gram > 2:  ## Increasing grams can increase accuracy
+        w = []
+        for i in range(len(words) - gram + 1):
+            w += [' '.join(words[i:i + gram])]
+        return w
+
+    # Remove stopwords
+    sw = stopwords.words('english')
+    words = [word for word in words if word not in sw]
+    # Create new list without stopwords
+
+    stemmer = PorterStemmer()  # Stem words
+    words = [stemmer.stem(word) for word in words]
+    # Create new list of stemmed words
+
+    return words
+ \end{lstlisting}
+
+ The actions performed on the text consist of:
+ \begin{itemize}
+ \item Convert to lower case: Because 'DROP' and 'drop', and words alike, convey the same meaning, all text is simply converted to lower case.
+ \item Tokenise words: This splits the text into individual words. A list is then created from the tokens longer than 2 characters - shorter words such as 'is', 'he' and 'if' do not contribute to spam detection and are seen as generic words in the English language.
+ \item Ngrams: This is implemented to provide richer word sequences for the spam filter classification; as explained in the literature review, the use of n-grams can increase accuracy.
+ \item Stopword Removal: This removes stopwords such as 'this', 'we' and 'now' from the text, as these common words carry less importance for sentiment analysis.
+ \item Stemming: Reduces words down to a smaller form by removing suffixes from inflected words - 'studying' becomes 'study' \cite{40}. The Porter Stemmer works by removing suffixes from the text - 'going' becomes 'go'; however, for other words, such as 'leaves' becoming 'leav', the result is not a real word. This method is nevertheless applied equally to all words containing such suffixes, so every variation is reduced to the same form, which still allows the probability classifications to occur on the word.
+ \end{itemize}
+
+ As discovered from \cite{40}, lemmatisation could be an alternative, and arguably a better solution than stemming. Lemmatisation works fundamentally the same way as stemming, but reduces inflected words properly, ensuring that the root word belongs to the language. Using the same words that were used to describe stemming, lemmatisation reduces 'goes' to 'go' and 'leaves' to 'leaf' - removing the suffixes to produce the actual root word.
+ Although lemmatisation would provide the classifier with an actual English word, stemming still reduces all variations of a word down to the same form; this, together with a lemmatiser needing a corpus to map words to their root words and the additional computational time to do so, means the former approach of using a stemmer is sufficient.
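+
+ The difference can be seen directly in NLTK (a short illustrative snippet, separate from the project code; the WordNet corpus must be downloaded first via nltk.download('wordnet')):
+
+ \begin{lstlisting}[language=python, caption=Stemming versus lemmatisation in NLTK (illustrative)]
+from nltk.stem import PorterStemmer, WordNetLemmatizer
+
+stemmer = PorterStemmer()
+lemmatizer = WordNetLemmatizer()
+
+print(stemmer.stem("leaves"))          # 'leav' - suffix stripped, not a real word
+print(lemmatizer.lemmatize("leaves"))  # 'leaf' - an actual root word
+
+print(stemmer.stem("going"))           # 'go' - here both approaches agree
+\end{lstlisting}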
 \subsection{Spam Filtering}
+ This section of the implementation describes how the spam filter is initialised in the \textit{tweet\_collector}, how it is trained, and how it classifies tweets as either spam or ham (wanted data).
+
+ \textit{Listing 12} shows the initialisation and method functions used within the \textit{tweet\_collector}, which create the training and testing datasets, and test the classifier on hard-coded example tweets, checking their classification.
+
+ \begin{lstlisting}[language=python, caption=Spam filter training Class - \textit{tweet\_collector.py}]
+import pandas as pd
+import spam_filter
+import numpy as np
+...
+
 class filterSpam(object):

     def __init__(self, training_set):
@@ -1210,6 +1296,7 @@ def train(self):
         # Train

     def testData_Prediction(self):
+        # Classify data from test dataset
         prediction = self.spamFilter.predict(self.testData['tweet'])
         return prediction
@@ -1219,9 +1306,12 @@ def testPrediction(self):
         spam = spam_filter.processTweet("Earn more than 0015 btc free No deposit No investment Free Bitcoins - Earn $65 free btc in 5 minutes bitcoin freebtc getbtc")
         ham = spam_filter.processTweet("Bitcoin closed with some gains in month of February")
-
+        # Process Tweets - Tokenise and Stem
+
+
         hamTweet = self.spamFilter.classify(ham)
         spamTweet = self.spamFilter.classify(spam)
+        # Classify both tweets

         print("Console: ", "Spam Tweet -- ", spamTweet)
         sys.stdout.flush()
@@ -1229,17 +1319,136 @@ def testPrediction(self):
         sys.stdout.flush()

     def filterStatistics(self, prediction):
+        # Get performance metrics comparing the predictions to the actual test labels
         spam_filter.metrics(self.testData['class'], prediction)

     def testTweet(self, tweet):
-
+        # Used for live tweet classification
         processed = spam_filter.processTweet(tweet)
         classified = self.spamFilter.classify(processed)
         return classified
-    \end{lstlisting}
-
-    \textbf{Naive Bayes model}
+    \end{lstlisting}
+
+ \begin{itemize}
+ \item filterSpam - \_\_init\_\_: is called when the \textit{tweet\_collector} script is first executed, and initialises the object, as first described in the 'Live Tweet Collection' section above.
+ \item trainFilter: calls the dataset function, which creates the training and testing datasets, followed by the train function, which trains the initialised classifier. Its sole purpose is to serve as a parent function that needs to be called only once to perform the child functions.
+ \item dataset: loads the pre-labelled spam dataset, remaps the labels to the integers 0 (ham) and 1 (spam), and splits the original data, indexing 75\% for the training dataset and 25\% for the testing dataset. It does this by extracting the data on either side of the split point into the relevant new datasets, resetting indexes and dropping old columns to form the appropriate data.
+ \item train: calls the classifier function defined in the \textit{spam\_filter} script and passes it the training data to initialise and then train on.
+ \item testData\_Prediction: similar to the 'train' function, but calls the 'predict' function defined in \textit{spam\_filter} to test the classifier on the test data, returning the predictions made; these are used later in the 'filterStatistics' function to calculate the accuracy of the classifier.
+ \item testPredictions: tests the trained classifier with pre-defined tweets that are assumed to be either spam or ham. The primary goal of this function is to ensure that the classifier correctly classifies the two tweets as spam or ham, as appropriate. The text is processed through the 'processTweet' function, previously described, to transform the tweets into tokens ready for classification.
+ \item filterStatistics: is used with the 'testData\_Prediction' function's output to calculate the accuracy of the classification model from the test data and prediction data. The 'metrics' function is defined in the \textit{spam\_filter} script.
+ \item testTweet: used on live tweets by the 'on\_data' function, also outlined previously, to process the tweet data and classify it as either spam or not; the 'on\_data' function then handles the result accordingly.
+ \end{itemize}
+
+ \textbf{Naive Bayes model}
+
+ The spam filter classifier, a Naive Bayes model, was coded from scratch. This was ultimately unnecessary, as the Scikit-learn Python package comes with four inbuilt Naive Bayes classification models (Bernoulli, Complement, Multinomial, Gaussian)\cite{41}. The model implemented was a multinomial Naive Bayes model, as the data used for classification is categorical and follows a multinomial distribution. The algorithm was not compared to Scikit-learn's inbuilt models for accuracy, as this was not the focus of this project. The model was coded from scratch because information was found on how this could be done with techniques such as TF-IDF and additive smoothing, as detailed in the literature review; the tutorial that helped the most was \textit{Spam Classifier in Python from scratch} \cite{34} \cite{42}. For an explanation of the maths behind this classifier, see the Literature review sections 'Bag Of Words', 'TF-IDF' and 'Additive Smoothing'.
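+
+ For comparison, the off-the-shelf route mentioned above takes only a few lines with Scikit-learn; the sketch below is illustrative only (the two example tweets are placeholders) and is not how the project's filter was built:
+
+ \begin{lstlisting}[language=python, caption=Equivalent multinomial Naive Bayes classifier using Scikit-learn (illustrative)]
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import make_pipeline
+
+tweets = ["Earn free btc in 5 minutes", "Bitcoin closed with some gains"]  # placeholder data
+labels = [1, 0]  # 1 = spam, 0 = ham
+
+# TF-IDF features feeding multinomial Naive Bayes; alpha is the additive smoothing constant
+model = make_pipeline(TfidfVectorizer(), MultinomialNB(alpha=1.0))
+model.fit(tweets, labels)
+print(model.predict(["free btc giveaway"]))
+\end{lstlisting}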
+ \begin{lstlisting}[language=python, caption=classifier class of spam\_filter.py]
+class classifier(object):
+    def __init__(self, trainData):
+        self.tweet = trainData['tweet']
+        self.labels = trainData['class']
+
+    def TF_and_IDF(self):
+        noTweets = self.tweet.shape[0]
+        self.spam = self.labels.value_counts()[1]
+        self.ham = self.labels.value_counts()[0]
+        self.total = self.spam + self.ham
+
+        # Initialise spam vars
+        self.spamCount = 0
+        self.hamCount = 0
+        self.tfSpam = dict()
+        self.tfHam = dict()
+        self.idfSpam = dict()
+        self.idfHam = dict()
+
+        # Bag Of Words implementation
+
+        for entry in range(noTweets):
+            processed = processTweet(self.tweet[entry])
+            count = list()
+            # Keeps track of whether a word has already occurred in this message (IDF count)
+
+            for word in processed:
+                if self.labels[entry]:
+                    self.tfSpam[word] = self.tfSpam.get(word, 0) + 1
+                    self.spamCount += 1
+                    # If the label for this entry is spam, add the word to the spam counts
+                else:
+                    self.tfHam[word] = self.tfHam.get(word, 0) + 1
+                    self.hamCount += 1
+                    # If the label for this entry is ham, add the word to the ham counts
+                # Record each distinct word once per tweet for the IDF (document) counts
+                if word not in count:
+                    count += [word]
+            for word in count:
+                # Loop over this tweet's distinct words
+                if self.labels[entry]:
+                    self.idfSpam[word] = self.idfSpam.get(word, 0) + 1
+                else:
+                    self.idfHam[word] = self.idfHam.get(word, 0) + 1
+
+    def TF_IDF(self):
+        self.probSpam = dict()
+        self.probHam = dict()
+        self.sumSpam = 0
+        self.sumHam = 0
+
+        # Calculate the probability of each word being spam or ham from its TF-IDF weight
+        for word in self.tfSpam:
+            self.probSpam[word] = (self.tfSpam[word]) * log((self.spam + self.ham) / (self.idfSpam[word] + self.idfHam.get(word, 0)))
+            self.sumSpam += self.probSpam[word]
+
+        for word in self.tfSpam:
+            # Additive smoothing with alpha = 1: add 1 to the numerator and the vocabulary size to the denominator
+            self.probSpam[word] = (self.probSpam[word] + 1) / (self.sumSpam + len(list(self.probSpam.keys())))
+
+        for word in self.tfHam:
+            self.probHam[word] = (self.tfHam[word]) * log((self.spam + self.ham) / (self.idfSpam.get(word, 0) + self.idfHam[word]))
+            self.sumHam += self.probHam[word]
+        for word in self.tfHam:
+            self.probHam[word] = (self.probHam[word] + 1) / (self.sumHam + len(list(self.probHam.keys())))
+
+        # Class priors: the proportions of spam and ham tweets in the training data
+        self.probSpamTotal, self.probHamTotal = self.spam / self.total, self.ham / self.total
+ \end{lstlisting}
+
+ \textbf{Classification}
+
+ This function aims to classify the pre-processed tweet data as either spam or ham, based on the term frequencies and probabilities calculated in the 'TF\_IDF' function. For each word in the processed tweet, the function checks whether the word is contained in the spam set and, based on its level of occurrence, adds the corresponding weight (the more often a word occurs, the more likely it is a generic word); the same is done for the ham set. Running totals of the probabilities are formed, and the overall spam and ham proportions are added to the spam and ham scores for the processed tweet. A boolean is then returned based on which probability is higher - \textit{pSpam} or \textit{pHam} - identifying whether the tweet is predominantly spam or ham (\textit{True} or \textit{False}).
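+
+ In equation form, the comparison carried out below restates the Naive Bayes decision rule in log space, with the class proportions \textit{probSpamTotal} and \textit{probHamTotal} acting as priors:
+
+ \[\log P(spam|tweet) \propto \log P(spam) + \sum_{w\ \in\ tweet} \log P(w|spam)\]
+
+ The tweet is flagged as spam when this score is greater than or equal to the equivalent score computed for ham.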
+ \begin{lstlisting}[language=python, caption=Classify function of the parent classifier class of spam\_filter.py]
+def classify(self, processed):
+    pSpam, pHam = 0, 0
+
+    for word in processed:
+        if word in self.probSpam:
+            pSpam += log(self.probSpam[word])
+        else:
+            # unseen word: fall back to the smoothed floor probability
+            pSpam -= log(self.sumSpam + len(list(self.probSpam.keys())))
+        if word in self.probHam:
+            pHam += log(self.probHam[word])
+        else:
+            pHam -= log(self.sumHam + len(list(self.probHam.keys())))
+    pSpam += log(self.probSpamTotal)
+    pHam += log(self.probHamTotal)
+    return pSpam >= pHam
+ \end{lstlisting}
+
+ \textbf{Predict}
+
+ \begin{lstlisting}[language=python, caption=Predict function of the parent classifier class of spam\_filter.py]
+def predict(self, testData):
+    result = dict()
+    for (i, tweet) in enumerate(testData):
+        processed = processTweet(tweet)
+        result[i] = int(self.classify(processed))
+    return result
+ \end{lstlisting}
+
+ \textbf{Metrics}

 \subsection{Sentiment Analysis}
 \subsubsection{VADER}
@@ -1290,6 +1499,9 @@ def testTweet(self, tweet):

 Another could be to predict the hour of sentiment and create a threshold for it.

+ Identify whether or not the use of n-grams improved the accuracy of spam classification
+
+ Identify whether using lemmatisation would change how spam classification occurred

 \newpage
 \nocite{*}
diff --git a/document.toc b/document.toc
index 36a2a99..d4478a4 100644
--- a/document.toc
+++ b/document.toc
@@ -54,86 +54,90 @@
 \defcounter {refsection}{0}\relax
 \contentsline {subsubsection}{Naive Bayes}{27}{section*.28}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Random Forest}{28}{section*.29}
+\contentsline {subsection}{Bag Of Words}{28}{section*.29}
 \defcounter {refsection}{0}\relax
-\contentsline {section}{Solution Approach}{29}{section*.30}
+\contentsline {subsection}{TF-IDF}{28}{section*.30}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Data gathering}{29}{section*.31}
+\contentsline {subsection}{Addictive Smoothing}{29}{section*.31}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Data pre-processing}{30}{section*.32}
+\contentsline {section}{Solution Approach}{30}{section*.32}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Spam Filtering}{30}{section*.33}
+\contentsline {subsection}{Data gathering}{30}{section*.33}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Language Detection}{31}{section*.34}
+\contentsline {subsection}{Data pre-processing}{31}{section*.34}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Sentiment Analysis}{31}{section*.35}
+\contentsline {subsection}{Spam Filtering}{31}{section*.35}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Neural Network}{32}{section*.36}
+\contentsline {subsection}{Language Detection}{32}{section*.36}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Price Forecasting}{34}{section*.38}
+\contentsline {subsection}{Sentiment Analysis}{32}{section*.37}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Frontend Application}{34}{section*.39}
+\contentsline {subsection}{Neural Network}{33}{section*.38}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{With reference to Initial PID}{34}{section*.40}
+\contentsline {subsection}{Price Forecasting}{35}{section*.40}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Solution Summary}{35}{section*.41}
+\contentsline {subsection}{Frontend Application}{35}{section*.41}
 \defcounter {refsection}{0}\relax
-\contentsline {subsection}{Data flow Overview}{36}{section*.42}
+\contentsline {subsection}{With reference to Initial PID}{35}{section*.42} \defcounter {refsection}{0}\relax -\contentsline {section}{System Design}{37}{section*.43} +\contentsline {subsection}{Solution Summary}{36}{section*.43} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Dataflow Designs}{37}{section*.44} +\contentsline {subsection}{Data flow Overview}{37}{section*.44} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Interface Design}{44}{section*.45} +\contentsline {section}{System Design}{38}{section*.45} \defcounter {refsection}{0}\relax -\contentsline {section}{Implementation}{45}{section*.47} +\contentsline {subsection}{Dataflow Designs}{38}{section*.46} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Data collection}{45}{section*.48} +\contentsline {subsection}{Interface Design}{45}{section*.47} \defcounter {refsection}{0}\relax -\contentsline {subsubsection}{Price Time-Series Historical Data}{45}{section*.49} +\contentsline {section}{Implementation}{46}{section*.49} \defcounter {refsection}{0}\relax -\contentsline {subsubsection}{Price Time-Series Live Data}{46}{section*.50} +\contentsline {subsection}{Data collection}{46}{section*.50} \defcounter {refsection}{0}\relax -\contentsline {subsubsection}{Historical Tweet Collection}{48}{section*.51} +\contentsline {subsubsection}{Price Time-Series Historical Data}{46}{section*.51} \defcounter {refsection}{0}\relax -\contentsline {subsubsection}{Live Tweet Collection}{50}{section*.52} +\contentsline {subsubsection}{Price Time-Series Live Data}{47}{section*.52} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Data pre-processing}{55}{section*.53} +\contentsline {subsubsection}{Historical Tweet Collection}{49}{section*.53} \defcounter {refsection}{0}\relax -\contentsline {subsubsection}{Spam Filtering}{57}{section*.54} +\contentsline {subsubsection}{Live Tweet Collection}{51}{section*.54} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Sentiment Analysis}{59}{section*.55} +\contentsline {subsection}{Data pre-processing}{56}{section*.55} \defcounter {refsection}{0}\relax -\contentsline {subsubsection}{VADER}{59}{section*.56} +\contentsline {subsection}{Spam Filtering}{60}{section*.56} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Recurrent Neural Network - LSTM}{59}{section*.57} +\contentsline {subsection}{Sentiment Analysis}{66}{section*.57} \defcounter {refsection}{0}\relax -\contentsline {subsubsection}{Training and Testing Model}{59}{section*.58} +\contentsline {subsubsection}{VADER}{66}{section*.58} \defcounter {refsection}{0}\relax -\contentsline {subsubsection}{Scoring and Validation}{59}{section*.59} +\contentsline {subsection}{Recurrent Neural Network - LSTM}{66}{section*.59} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Future Prediction Forecasting}{59}{section*.60} +\contentsline {subsubsection}{Training and Testing Model}{66}{section*.60} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Main File 'Main.py'}{59}{section*.61} +\contentsline {subsubsection}{Scoring and Validation}{66}{section*.61} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Miscellaneous}{59}{section*.62} +\contentsline {subsection}{Future Prediction Forecasting}{66}{section*.62} \defcounter {refsection}{0}\relax -\contentsline {section}{Testing Metrics and Accuracy}{60}{section*.63} +\contentsline {subsection}{Main File 'Main.py'}{66}{section*.63} \defcounter {refsection}{0}\relax -\contentsline {section}{Project 
Evaluation}{61}{section*.64} +\contentsline {subsection}{Miscellaneous}{66}{section*.64} \defcounter {refsection}{0}\relax -\contentsline {section}{Discussion: Contribution and Reflection}{61}{section*.65} +\contentsline {section}{Testing Metrics and Accuracy}{67}{section*.65} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Limitations}{61}{section*.66} +\contentsline {section}{Project Evaluation}{68}{section*.66} \defcounter {refsection}{0}\relax -\contentsline {section}{Conclusion and Future Improvements}{62}{section*.67} +\contentsline {section}{Discussion: Contribution and Reflection}{68}{section*.67} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Conclusion}{62}{section*.68} +\contentsline {subsection}{Limitations}{68}{section*.68} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Future Improvements}{62}{section*.69} +\contentsline {section}{Conclusion and Future Improvements}{69}{section*.69} \defcounter {refsection}{0}\relax -\contentsline {section}{Appendices}{67}{section*.71} +\contentsline {subsection}{Conclusion}{69}{section*.70} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Appendix A - Project Initiation Document}{67}{section*.72} +\contentsline {subsection}{Future Improvements}{69}{section*.71} \defcounter {refsection}{0}\relax -\contentsline {subsection}{Appendix B - Log book}{80}{section*.73} +\contentsline {section}{Appendices}{74}{section*.73} +\defcounter {refsection}{0}\relax +\contentsline {subsection}{Appendix A - Project Initiation Document}{74}{section*.74} +\defcounter {refsection}{0}\relax +\contentsline {subsection}{Appendix B - Log book}{87}{section*.75} diff --git a/report.bib b/report.bib index 0d42c66..1d66168 100644 --- a/report.bib +++ b/report.bib @@ -318,6 +318,26 @@ } @inproceedings{33, + title={A Beginner's Guide to Bag of Words and TF-IDF}, + author={Skymind}, + booktitle={A.I Wiki}, + pages={}, + year={2018}, + organization={Skymind}, + url={https://skymind.ai/wiki/bagofwords-tf-idf} +} + +@inproceedings{34, + title={Spam Classifier in Python from scratch}, + author={Tejan Karmali}, + booktitle={}, + pages={}, + year={Aug 2, 2017}, + organization={Towards Data Science}, + url={https://towardsdatascience.com/spam-classifier-in-python-from-scratch-27a98ddd8e73} +} + +@inproceedings{35, title={Tweepy Documentation}, author={Joshua Roesslein}, booktitle={}, @@ -328,7 +348,7 @@ url={http://docs.tweepy.org/en/v3.5.0/} } -@inproceedings{34, +@inproceedings{36, title={Tensorflow Vs. 
Theano: What Do Researchers Prefer As An Artificial Intelligence Framework}, author={Srishti Deoras}, booktitle={}, @@ -340,7 +360,7 @@ url={https://www.analyticsindiamag.com/tensorflow-vs-theano-researchers-prefer-artificial-intelligence-framework} } -@inproceedings{35, +@inproceedings{37, title={}, author={bitcoincharts}, booktitle={}, @@ -350,7 +370,7 @@ url={http://api.bitcoincharts.com/v1/csv/} } -@inproceedings{36, +@inproceedings{38, title={Detecting Text Language With Python and NLTK}, author={Alejandro Nolla}, booktitle={}, @@ -358,4 +378,44 @@ year={}, organization={Alejandro Nolla Blog}, url={http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/} +} + +@inproceedings{39, + title={A tutorial on Automatic Language Identification - ngram based}, + author={Practical Cryptography}, + booktitle={}, + pages={}, + year={}, + organization={Practical Cryptography}, + url={http://practicalcryptography.com/miscellaneous/machine-learning/tutorial-automatic-language-identification-ngram-b/} +} + +@inproceedings{40, + title={What is the difference between stemming and lemmatization}, + author={Tita Risueno}, + booktitle={}, + pages={}, + year={Feb 26, 2018}, + organization={Bitext}, + url={https://blog.bitext.com/what-is-the-difference-between-stemming-and-lemmatization/} +} + +@inproceedings{41, + title={Naive Bayes}, + author={scikit-learn developers}, + booktitle={}, + pages={}, + year={}, + organization={Scikit-Learn}, + url={https://scikit-learn.org/stable/modules/naive_bayes.html} +} + +@inproceedings{42, + title={Spam-or-Ham}, + author={Tejan Karmali - tejank10}, + booktitle={}, + pages={}, + year={Aug 2, 2017}, + organization={Github}, + url={https://github.com/tejank10/Spam-or-Ham} } \ No newline at end of file