{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "e070e325", "metadata": {}, "outputs": [], "source": [ "\n", "\"\"\"Sentiment Analysis of Financial News Headlines with Market Comparison\"\"\"\n", "\"\"\"Cardiff University School of Computer Science and Informatics Final Year Project\"\"\"\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a311a288", "metadata": {}, "outputs": [], "source": [ "\"\"\"Collect Headline Data\"\"\"\n", "\n", "import pandas\n", "\n", "cnbcData = pandas.read_csv('Data/Original/cnbc_headlines.csv')\n", "guardianData = pandas.read_csv('Data/Original/guardian_headlines.csv')\n", "reutersData = pandas.read_csv('Data/Original/reuters_headlines.csv')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f534c23d", "metadata": {}, "outputs": [], "source": [ "\"\"\"Initial Clean Headline Data\"\"\"\n", "\n", "from datetime import datetime, timedelta\n", "\n", "#Removing Redundant Data\n", "def cleanData(dataSet):\n", " dataSet = dataSet.dropna()\n", " try:\n", " dataSet = dataSet.drop('Description', axis=1)\n", " except:\n", " print(\"Headline data set contains no description\")\n", " dataSet = dataSet.drop_duplicates(subset=['Headlines'], keep='first')\n", " dataSet.reset_index(drop=True, inplace=True)\n", " return dataSet\n", "\n", "#Converting Date Values To 'datetime64' Format\n", "def dateConversion(date):\n", " date = date.replace(\"Sept\", \"Sep\").replace(\"March\", \"Mar\").replace(\"April\", \"Apr\").replace(\"June\", \"Jun\").replace(\"July\", \"Jul\")\n", " if date[0].isspace():\n", " date = date.replace(\" \", \"0\", 1)\n", " date = date.replace(\", \", \", 0\", 1)\n", " return date\n", "\n", "#CNBC\n", "\n", "cnbcData = cleanData(cnbcData)\n", "dateFormat = '%I:%M %p ET %a, %d %b %Y'\n", "dates = []\n", "for item in cnbcData.iloc[:, 1].values:\n", " item = dateConversion(item)\n", " dates.append(datetime.strptime(item, dateFormat).strftime(\"%m-%d-%Y\"))\n", "cnbcData[\"Time\"] = dates\n", 
"cnbcData[\"Time\"] = cnbcData[\"Time\"].astype(\"datetime64\")\n", "cnbcData.rename(columns={\"Time\":\"Date\"}, inplace = True)\n", "\n", "#Guardian\n", "\n", "guardianData = cleanData(guardianData)\n", "guardianData[\"Time\"] = pandas.to_datetime(guardianData[\"Time\"], errors = 'coerce', format=\"%d-%b-%y\")\n", "guardianData.rename(columns={\"Time\":\"Date\"}, inplace = True)\n", "\n", "#Reuters\n", "\n", "reutersData = cleanData(reutersData)\n", "reutersData[\"Time\"] = reutersData[\"Time\"].astype(\"datetime64\")\n", "reutersData.rename(columns={\"Time\":\"Date\"}, inplace = True)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5d6f286b", "metadata": {}, "outputs": [], "source": [ "\"\"\"Combine Headline Data\"\"\"\n", "\n", "dataSets = [cnbcData, guardianData, reutersData]\n", "headlineData = pandas.concat(dataSets)\n", "print(\"Pre Cleaning: \")\n", "headlineData.info()\n", "headlineData = headlineData.sort_values(by=\"Date\")\n", "headlineData = cleanData(headlineData)\n", "print(\" \")\n", "print(\"Post Cleaning: \")\n", "headlineData.info()\n", "headlineData.to_csv('Data/all_headlines.csv')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "62d41e21", "metadata": {}, "outputs": [], "source": [ "\"\"\"Collect and Clean Market Data\"\"\"\n", "\n", "import yfinance\n", "\n", "def getMarketData(ticker):\n", " securityData = yfinance.download(ticker, start='2017-12-22', end='2020-07-18')\n", " securityData = securityData['Close']\n", " return securityData\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e06e125c", "metadata": {}, "outputs": [], "source": [ "\"\"\"Explority Data Analysis Investigating Distribution\"\"\"\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "fig, axs = plt.subplots(1, 2, figsize=(16, 2))\n", "\n", "#Time Series Plot\n", "spy = getMarketData('SPY')\n", "axs[0].plot(spy, 'tab:red')\n", "axs[0].set_title('S&P 500 Index Time Series')\n", "axs[0].axes.get_xaxis().set_ticks([])\n", 
"axs[0].set_xlabel('Date (December 2017 - July 2020)')\n", "axs[0].set_ylabel('Price (USD)')\n", "\n", "#Frequency Distribution Plot\n", "headlineQuantity = headlineData[\"Headlines\"].groupby([headlineData[\"Date\"].dt.year, headlineData[\"Date\"].dt.month]).count().tolist()\n", "axs[1].bar(list(range(len(headlineQuantity))), headlineQuantity)\n", "axs[1].set_title('Number Of Headlines Published Monthly')\n", "axs[1].axes.get_xaxis().set_ticks([])\n", "axs[1].set_xlabel('Month (December 2017 - July 2020)')\n", "axs[1].set_ylabel('Headlines')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "77e48bec", "metadata": {}, "outputs": [], "source": [ "\"\"\"Explority Data Analysis - Vocabulary Representation\"\"\"\n", "\n", "from wordcloud import WordCloud, STOPWORDS\n", "\n", "fig, axs = plt.subplots(1, 3, figsize=(17, 6))\n", "\n", "reutersDataString = \" \".join(reutersData[\"Headlines\"].to_list())\n", "guardianDataString = \" \".join(guardianData[\"Headlines\"].to_list())\n", "cnbcDataString = \" \".join(cnbcData[\"Headlines\"].to_list())\n", "#Generating Word Clouds For Each Dataset\n", "reutersWordcloud = WordCloud(stopwords=STOPWORDS, background_color='black', max_words=250).generate(reutersDataString)\n", "guardianWordcloud = WordCloud(stopwords=STOPWORDS, background_color='black', max_words=250).generate(guardianDataString)\n", "cnbcWordcloud = WordCloud(stopwords=STOPWORDS, background_color='black', max_words=250).generate(cnbcDataString)\n", "\n", "#Word Cloud Plot\n", "axs[0].imshow(reutersWordcloud, interpolation='bilinear')\n", "axs[0].set_title('Reuters Word Cloud')\n", "axs[0].axis(\"off\")\n", "axs[1].imshow(guardianWordcloud, interpolation='bilinear')\n", "axs[1].set_title('Guardian Word Cloud')\n", "axs[1].axis(\"off\")\n", "axs[2].imshow(cnbcWordcloud, interpolation='bilinear')\n", "axs[2].set_title('CNBC Word Cloud')\n", "axs[2].axis(\"off\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "4f3dc5f8", "metadata": {}, 
"outputs": [], "source": [ "\"\"\"Explority Data Analysis - Word Type Frequency\"\"\"\n", "\n", "import nltk, nltk.classify\n", "from collections import Counter\n", "import re\n", "\n", "typeList = []\n", "#Basic Pre-Processing\n", "headlineString = \" \".join(headlineData[\"Headlines\"])\n", "headlineString = re.sub(r'[^a-zA-Z]', \" \", headlineString.lower())\n", "tokenizedHeadlines = nltk.word_tokenize(str(headlineString))\n", "wordTypeList = nltk.pos_tag(tokenizedHeadlines) #Word Type Identfication\n", "for wordType in wordTypeList:\n", " typeList.append(wordType[1])\n", "wordsTypeFrequency = Counter(typeList) #Count Word Type Frequency\n", "wordsTypeFrequency = {k: v for k, v in sorted(wordsTypeFrequency.items(), key=lambda item: item[1])}\n", "\n", "label = list(wordsTypeFrequency.keys())\n", "frequency = list(wordsTypeFrequency.values())\n", " \n", "#Word Type Frequency Plot\n", "plt.figure(figsize=(16, 2))\n", "plt.bar(label, frequency)\n", "plt.xlabel('Word Type')\n", "plt.ylabel('Frequency')\n", "plt.title('Word Type And Respective Frequency Of Occurrence In All Headlines')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0abb2c76", "metadata": {}, "outputs": [], "source": [ "\"\"\"Pre-Processing\"\"\"\n", "\n", "from nltk.corpus import words, stopwords\n", "from nltk.stem import WordNetLemmatizer, PorterStemmer\n", "\n", "lemmatizer = WordNetLemmatizer()\n", "stemmer = PorterStemmer()\n", "\n", "#nltk.download()\n", "\n", "def preProcessing(text):\n", " text = namedEntityRecognition(text)[0] #Named Entity Recognition\n", " text = re.sub(r'[^a-zA-Z]', \" \", text.lower()).split() #Formatting Words\n", " text = list(set(text) - set(str(stopwords))) #Stop Word Removal\n", " text = list(set(text) & set(words.words())) #Non-Word Removal \n", " for word in text:\n", " word = lemmatizer.lemmatize(word) #Lemmatization\n", " word = stemmer.stem(str(word)) #Stemming \n", " return text\n" ] }, { "cell_type": "code", "execution_count": null, "id": 
"580dcc26", "metadata": {}, "outputs": [], "source": [ "\"\"\"Named Entity Recognition\"\"\"\n", "\n", "import spacy\n", "from spacy import displacy\n", "\n", "NER = spacy.load(\"en_core_web_sm\")\n", "\n", "def namedEntityRecognition(text):\n", " selectedSecurities = {'S&P': 'SPY', 'google': 'GOOGL', 'amazon': 'AMZN', 'apple': 'AAPL',\n", " 'microsoft': 'MSFT','visa': 'V', 'johnson': 'JNJ', 'walmart': 'WMT',\n", " 'exxon': 'XOM', 'FB': 'facebook', 'TSLA': 'tesla'}\n", " securityNames = selectedSecurities.keys()\n", " text = text.lower().split()\n", " for word in text:\n", " word = stemmer.stem(str(word)) #Reduce Company To Stem\n", " securities = []\n", " for security in securityNames:\n", " if security in text:\n", " text[text.index(security)] = 'COMPANY' #Company Name Removal\n", " securities.append(selectedSecurities[security]) #Identify for Analysis \n", " if len(securities) == 0:\n", " securities.append('^GSPC')\n", " text = ' '.join(text)\n", " text = NER(text)\n", " text = ' '.join([t.text if not t.ent_type_ else t.ent_type_ for t in text]).lower()\n", " return text, securities\n" ] }, { "cell_type": "code", "execution_count": null, "id": "2f3eadf5", "metadata": {}, "outputs": [], "source": [ "\"\"\"Generate Word Set (Bag-Of-Words Representation)\"\"\"\n", "\n", "import numpy \n", "import time\n", "\n", "wordSet = []\n", "start = time.time()\n", "for headline in headlineData[\"Headlines\"].to_list():\n", " composedHeadline = preProcessing(headline)\n", " wordSet = numpy.union1d(wordSet, composedHeadline) #Add Word If Not Present In BoW\n", "end = time.time()\n", "textfile = open(\"Data/wordset.txt\", \"w\")\n", "for element in wordSet:\n", " textfile.write(element + \"\\n\")\n", "textfile.close()\n", "\n", "print(\"Runtime: \" + str(round((end - start), 2)))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "48c1c5ff", "metadata": {}, "outputs": [], "source": [ "#Dimensionality Reduction\n", "\n", "headlineString = \" 
\".join(headlineData[\"Headlines\"])\n", "headlineString = re.sub(r'[^a-zA-Z]', \" \", headlineString.lower()).split()\n", "print('Pre Pre-Processing Dimensionality: ' + str(len(set(headlineString))))\n", "\n", "wordList = open('Data/wordset.txt', 'r')\n", "wordSet = wordList.read().split()\n", "print('Post Pre-Processing Dimensionality: ' + str(len(wordSet)))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "6b91bc65", "metadata": {}, "outputs": [], "source": [ "\"\"\"Feature Extraction\"\"\"\n", "\n", "def featureExtraction(words, document): \n", " wordFrequency = dict.fromkeys(words, 0)\n", " for word in document:\n", " if word in words:\n", " wordFrequency[word] = document.count(word) #Frequency Of Occurrence\n", " return wordFrequency\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d4ec0909", "metadata": {}, "outputs": [], "source": [ "\"\"\"Annotating Training Data\"\"\"\n", "\n", "import random\n", "\n", "#Data Acsess\n", "wordList = open('Data/wordset.txt', 'r')\n", "wordSet = wordList.read().split()\n", "headlineData = pandas.read_csv('Data/all_headlines.csv')\n", "\n", "#Data Frame Construction\n", "annotatedTrainingData = pandas.DataFrame(data={'Document': [], 'Sentiment': []})\n", "trainingFeatures = pandas.DataFrame(data=dict.fromkeys(wordSet, []))\n", "annotatedTrainingData = annotatedTrainingData.append(trainingFeatures, ignore_index=True)\n", "#trainingDataCorpus = pandas.read_csv('Data/training_data.csv')\n", "\n", "#Interactive Annotation Function\n", "def annotateTrainingData():\n", " #Instructions\n", " print(\"Evaluate the overall sentiment of each headline:\")\n", " print(\"If positive enter 'positive', if negative enter 'negative' and if nuetral enter 'nuetral'.\")\n", " print(\"Enter 'END' at any point to stop the process!\")\n", " unprocessedHeadlines = headlineData[\"Headlines\"].to_list()\n", " for headline in unprocessedHeadlines:\n", " headline = namedEntityRecognition(headline)[0]\n", " entry = \"\"\n", " 
while entry != \"END\":\n", " headlineIndex = random.randint(0, len(unprocessedHeadlines)) #Random headline Selection\n", " headline = unprocessedHeadlines[headlineIndex]\n", " unprocessedHeadlines.remove(headline)\n", " entry = input(headline + \" - Enter the sentiment of this headline: \") #Labeling\n", " if entry != 'END':\n", " #Process Dataset For Machine Learning\n", " sentiment = entry\n", " features = list(featureExtraction(wordSet, preProcessing(headline)).values())\n", " entry = [headline, sentiment]\n", " for feature in features:\n", " entry.append(feature)\n", " #Add Entry To Data Frame\n", " annotatedTrainingData.loc[len(annotatedTrainingData)] = entry\n", " annotatedTrainingData.to_csv('Data/Training/annotated_training_data.csv')\n", " headlineData.drop(headlineIndex, axis=0)\n", " headlineData.to_csv('Data/all_headlines.csv')\n", "\n", "#annotateTrainingData()" ] }, { "cell_type": "code", "execution_count": null, "id": "e901f9e3", "metadata": {}, "outputs": [], "source": [ "\"\"\"External Training Data (Document-Term Matrix)\"\"\"\n", "\n", "#Data Acsess\n", "externalData = pandas.read_csv('Data/Training/external_data.csv', encoding = 'unicode_escape', engine ='python')\n", "externalData = cleanData(externalData)\n", "wordList = open('Data/wordset.txt', 'r')\n", "wordSet = wordList.read().split()\n", "\n", "#Data Frame Construction\n", "externalTrainingData = pandas.DataFrame(data={'Document': [], 'Sentiment': []})\n", "trainingFeatures = pandas.DataFrame(data=dict.fromkeys(wordSet, []))\n", "externalTrainingData = externalTrainingData.append(trainingFeatures, ignore_index=True)\n", "\n", "headlines = externalData[\"Headlines\"].to_list()\n", "sentiment = externalData[\"Sentiment\"].to_list()\n", "\n", "#Process Dataset For Machine Learning\n", "for row in range(1, len(headlines)):\n", " headlines[row] = namedEntityRecognition(headlines[row])[0] \n", " features = list(featureExtraction(wordSet, preProcessing(headlines[row])).values())\n", " entry = 
"""Test-Train Split"""

#80-20 Split
#The first 80% of the observations train the classifiers and the remaining 20%
#test them. The earlier slices ([80%:] for training, [:20%] for testing) trained
#on the last fifth of the data and left the middle 60% in neither set.
splitIndex = (len(structuredLabledData) * 80) // 100
trainingData = structuredLabledData[:splitIndex]
testingData = structuredLabledData[splitIndex:]

#Separate feature dictionaries from sentiment labels
xTrain = [features for (features, label) in trainingData]
yTrain = [label for (features, label) in trainingData]
xTest = [features for (features, label) in testingData]
yTest = [label for (features, label) in testingData]
"execution_count": null, "id": "aa7da40a", "metadata": {}, "outputs": [], "source": [ "\"\"\"Standardisation\"\"\"\n", "\n", "from sklearn.preprocessing import StandardScaler \n", "\n", "scaler = StandardScaler()\n", "\n", "numericFeatureList = []\n", "for values in xTrain:\n", " numericFeatures = list(xTrain[0].values())\n", " numericFeatureList.append(numericFeatures)\n", "\n", "#Standardize Data \n", "scaler.fit(numericFeatureList)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8032e1e5", "metadata": {}, "outputs": [], "source": [ "\"\"\"Classifier Testing\"\"\"\n", "\n", "import sklearn\n", "from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, ConfusionMatrixDisplay\n", "\n", "def evaluationMeasures(classifier, testData):\n", " #Confusion Matrix\n", " testFeatures = [feature for (feature, label) in testData]\n", " testLabels = [label for (feature, label) in testData]\n", " testPredicted = [classifier.classify(feature) for feature in testFeatures]\n", " confusionMatrix = sklearn.metrics.confusion_matrix(testLabels, testPredicted)\n", " display = ConfusionMatrixDisplay(confusion_matrix = confusionMatrix)\n", " #Alternate Metrics\n", " accuracy = accuracy_score(testPredicted, testLabels) #Accuracy\n", " recall = recall_score(testPredicted, testLabels, average=None) #Recall\n", " precision = precision_score(testPredicted, testLabels, average=None) #Precision\n", " f1Score = f1_score(testPredicted, testLabels, average=None) #F1 Score\n", " #Output\n", " display.plot()\n", " print(\" \")\n", " print(\"Accuracy: \" + str(round(accuracy, 2)))\n", " print(\"Recall: \" + str(round(recall[0], 2)))\n", " print(\"Precision: \" + str(round(precision[0], 2)))\n", " print(\"F1 Score: \" + str(round(f1Score[0], 2)))\n", " print(\" \")\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "2eb6962f", "metadata": {}, "outputs": [], "source": [ "\"\"\"Testing Baseline\"\"\"\n", "\n", "from sklearn.dummy import 
def crossValidation(classifier, trainingData):
    """Run 10-fold cross validation and return a classifier trained on all the data.

    Parameters:
        classifier: object whose train(labelledData) returns a trained
            classifier exposing classify(features).
        trainingData: list of (features, label) pairs.

    Returns:
        The classifier returned by a final train() call on all of
        `trainingData`.

    Side effects:
        Prints the mean accuracy over the ten held-out folds.

    Fixes over the previous version:
      * each fold was trained on the slice from the first to the last training
        index, which for every fold but the first and last also contained the
        held-out observations; the fold's training indices are now used;
      * no fold was ever evaluated and the accuracy list stayed empty.

    NOTE(review): the classifier returned has seen every observation in
    `trainingData`, so it should only be scored on data that was not passed in.
    """
    KFoldCV = sklearn.model_selection.KFold(n_splits=10) #10 Folds
    KFoldAccuracy = []
    for trainIndex, testIndex in KFoldCV.split(trainingData): #Fold Testing
        foldTraining = [trainingData[index] for index in trainIndex]
        foldTesting = [trainingData[index] for index in testIndex]
        classifier = classifier.train(foldTraining)
        correct = sum(1 for (features, label) in foldTesting
                      if classifier.classify(features) == label)
        KFoldAccuracy.append(correct / len(foldTesting))
    print("Mean Cross Validation Accuracy: " + str(round(sum(KFoldAccuracy) / len(KFoldAccuracy), 2)))
    #Final model uses every observation supplied
    classifier = classifier.train(trainingData)
    return classifier
"""Calculate Learning Curve"""

from sklearn.model_selection import learning_curve

#Unfitted SVM estimator; learning_curve fits it itself
svmEstimator = SVC()

#10-fold learning curve over the numeric training features and labels.
#return_times=True adds the fit and score timings to the returned values.
learningCurveResults = learning_curve(
    estimator=svmEstimator,
    X=numericFeatureList,
    y=yTrain,
    cv=10,
    return_times=True,
)
trainSizes, trainScores, testScores, fitTimes, scoreTimes = learningCurveResults
"axs[2].plot(numpy.mean(fitTimes, axis=1), numpy.mean(testScores, axis=1))\n", "axs[2].set_xlabel('Fit Time')\n", "axs[2].set_ylabel('Score')\n", "axs[2].set_title('SVM Model Performance')\n", "axs[2].grid()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "18418cf7", "metadata": {}, "outputs": [], "source": [ "\"\"\"Optimising Model (Hyper Parameter Tuning - Kernel)\"\"\"\n", "\n", "from sklearn.svm import LinearSVC\n", "\n", "svmClassifierOptamised = nltk.classify.SklearnClassifier(LinearSVC())\n", "svmClassifierOptamised = crossValidation(svmClassifierOptamised, structuredLabledData)\n", "print(\"Support Vector Machine Classifier: \")\n", "evaluationMeasures(svmClassifierOptamised, testingData)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "bab044f4", "metadata": {}, "outputs": [], "source": [ "\"\"\"Final Pipeline\"\"\"\n", "\n", "#Data Acsess\n", "wordList = open('Data/complete-wordset.txt', 'r')\n", "wordSet = wordList.read().split()\n", "\n", "def classificationPipeline(document):\n", " headline = document\n", " document = preProcessing(document) #Pre-Processing\n", " securities = namedEntityRecognition(headline)[1] #NER\n", " features = (featureExtraction(wordSet, document)) #Feature Extraction\n", " sentiment = svmClassifierOptamised.classify(features) #Classification\n", " return [headline, securities, sentiment]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3fc5c680", "metadata": {}, "outputs": [], "source": [ "\"\"\"Sentiment Processing\"\"\"\n", "\n", "#Data Acsess\n", "unclassifiedData = pandas.read_csv('Data/all_headlines.csv')\n", "unclassifiedData = cleanData(unclassifiedData)\n", "\n", "#Data Frame Construction\n", "classifiedData = pandas.DataFrame(data={'Document': [], 'Securities': [], 'Sentiment': [], 'Date': []})\n", "\n", "headlines = unclassifiedData[\"Headlines\"].to_list()\n", "\n", "start = time.time()\n", "for row in range(0, len(headlines)): #Headline Processing\n", " entry = 
"""Sentiment Results - Distribution"""

#Data Access
classifiedData = pandas.read_csv('Data/classified_data.csv')

#Sentiment Frequency
sentiments = classifiedData["Sentiment"].to_list()
sentimentFrequency = Counter(sentiments)

fig, axs = plt.subplots(1, 2, figsize=(16, 2))

#Sentiment Frequency Plot
#Colours are looked up per label; the previous positional list gave each bar a
#colour according to the order in which labels first appear in the file.
sentimentColours = {'neutral': 'grey', 'negative': 'red', 'positive': 'green'}
labels = list(sentimentFrequency.keys())
frequency = list(sentimentFrequency.values())
axs[0].bar(labels, frequency, color=[sentimentColours.get(label, 'blue') for label in labels])
axs[0].set_xlabel('Sentiment Polarity')
axs[0].set_ylabel('Sentiment Occurance')
axs[0].set_title('Sentiment Occurance From Classification')

classifiedData = classifiedData[classifiedData['Sentiment'] != 'neutral'] #Objectivity Filtering
sentiments = classifiedData["Sentiment"].to_list()
print("Number of Subjective Headlines: " + str(len(sentiments)))

#Security Frequency
securities = classifiedData["Securities"].to_list()
securities = ' '.join(securities)
securities = re.sub(r'[^a-zA-Z]', " ", securities).split()
securitiesFrequency = Counter(securities)
#Drop the default index entry; the default avoids a KeyError when no
#remaining headline carries it.
securitiesFrequency.pop('GSPC', None)

#Security Frequency Plot
labels = list(securitiesFrequency.keys())
frequency = list(securitiesFrequency.values())
axs[1].bar(labels, frequency)
axs[1].set_xlabel('Security')
axs[1].set_ylabel('Security Occurance')
axs[1].set_title('Security Occurance From Classification')
"metadata": {}, "outputs": [], "source": [ "\n", "def calculateAverageSentiment(sentiments):\n", " totalSum = 0\n", " for sentiment in sentiments:\n", " if sentiment == 'positive': #Value Assignment\n", " totalSum += 1\n", " average = round(totalSum/len(sentiments), 3) #Average Calculation\n", " return average\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b1ff8ad8", "metadata": {}, "outputs": [], "source": [ "\n", "def normalizeMarketData(marketData):\n", " marketData = (marketData - numpy.min(marketData)) / (numpy.max(marketData) - numpy.min(marketData)) #Normalize\n", " normalizedPriceList = []\n", " for price in marketData: #Cleaning List\n", " normalizedPriceList.append(price)\n", " return normalizedPriceList\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e17a5877", "metadata": {}, "outputs": [], "source": [ "\"\"\"Time Series (Sentiment and Price)\"\"\"\n", "\n", "def timeSeries(security):\n", " marketData = getMarketData(security)\n", " normalizedPriceList = normalizeMarketData(marketData)\n", " monthlyPrice = normalizedPriceList[::21] #Monthly Close Price\n", " #Monthly Sentiment Results for Security\n", " monthlySentimentData = classifiedData.loc[classifiedData['Securities'] == \"['\" + security + \"']\"]['Sentiment'].groupby([pandas.to_datetime(headlineData[\"Date\"]).dt.year, pandas.to_datetime(headlineData[\"Date\"]).dt.month])\n", " missingValues = len(monthlyPrice) - len(monthlySentimentData)\n", " monthlySentiment = []\n", " for sentiment in monthlySentimentData:\n", " sentimentList = sentiment[1].to_list()\n", " sentimentAverage = calculateAverageSentiment(sentimentList)\n", " monthlySentiment.append(sentimentAverage)\n", " insertPlace = len(monthlyPrice) // missingValues\n", " for x in range(0, missingValues):\n", " monthlySentiment.insert(insertPlace, 0.5)\n", " insertPlace += insertPlace\n", " #Months with Insufficient Data\n", " monthlySentiment = monthlySentiment[2:]\n", " monthlyPrice = monthlyPrice[2:]\n", " 
def securityResults(security):
    """Plot and summarise headline sentiment against market price for one security.

    Parameters:
        security: ticker symbol, passed to timeSeries and scatterPlot.

    Side effects:
        Draws a two-panel figure (monthly sentiment and price; daily sentiment
        against daily price change with a fitted line) and prints the
        correlation coefficient of each linear fit together with the t and p
        values of an independent two-sample t-test on each pair of series.

    Fixes over the previous version:
      * timeSeries and scatterPlot were each recomputed for every use (six and
        eight calls), and both call getMarketData; each now runs once;
      * the scatter axis labels were exchanged: sentiment is plotted on x and
        price change on y;
      * the lines labelled "Correlation" printed the regression slope; they
        now print the r value returned by linregress.
    """
    monthlySentiment, monthlyPrice = timeSeries(security)
    dailySentiment, dailyPriceChange = scatterPlot(security)

    fig, axs = plt.subplots(1, 2, figsize=(16, 2))
    #Plot Time Series
    #Both monthly series are cut to the same first 28 entries before fitting
    reflectGradient, reflectIntercept, reflectRValue, reflectPValue, reflectStdError = stats.linregress(monthlySentiment[:28], monthlyPrice[:28])
    axs[0].plot(monthlySentiment, 'tab:blue', label='Headline Sentiment')
    axs[0].plot(monthlyPrice, 'tab:red', label= str(security + ' Price'))
    axs[0].set_title('Monthly ' + security + ' Headline Sentiment and Market Price')
    axs[0].set_xlabel('(Febuary 2018 - July 2020)')
    axs[0].legend(loc='upper left')
    #Plot Scatter Graph Change
    axs[1].scatter(dailySentiment, dailyPriceChange, c='black', marker='x')
    directGradient, directIntercept, directRValue, directPValue, directStdError = stats.linregress(dailySentiment, dailyPriceChange)
    x1 = numpy.linspace(numpy.min(dailySentiment), numpy.max(dailySentiment) ,500)
    y1 = directGradient * x1 + directIntercept
    axs[1].plot(x1, y1,'-r')
    axs[1].set_title('Daily ' + security + ' Headline Sentiment Against Market Price Change')
    axs[1].set_xlabel('Daily Sentiment Change')
    axs[1].set_ylabel('Daily ' + security + ' Price Change (USD)')
    #Statistical Analysis
    directionStatistics = stats.ttest_ind(dailySentiment, dailyPriceChange)
    reflectionStatistics = stats.ttest_ind(monthlySentiment[:28], monthlyPrice[:28])
    print('Direction Correlation: ' + str(round(directRValue, 2)))
    print('Direction T-Value: ' + str(round(directionStatistics[0], 2)))
    print('Direction P-Value: ' + str(round(directionStatistics[1], 2)))
    print('Reflection Correlation: ' + str(round(reflectRValue, 2)))
    print('Reflection T-Value: ' + str(round(reflectionStatistics[0], 2)))
    print('Reflection P-Value: ' + str(round(reflectionStatistics[1], 2)))
"""Walmart Results"""

# 'WMT' is the ticker that namedEntityRecognition records for 'walmart'.
securityResults('WMT')