{ "cells": [ { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: PYTHONHASHSEED=1\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "import time\n", "import datetime\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.metrics import roc_auc_score, accuracy_score\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer\n", "from sklearn.decomposition import LatentDirichletAllocation\n", "from sklearn.neural_network import MLPClassifier\n", "from scipy.stats import ttest_ind\n", "\n", "%set_env PYTHONHASHSEED=1\n", "# %matplotlib inline\n", "import itertools\n", "import tqdm as tqdm\n", "\n", "from gensim.test.utils import common_texts, get_tmpfile\n", "from gensim.models import Word2Vec, Doc2Vec, LdaModel, TfidfModel\n", "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n", "from gensim.test.utils import common_corpus, common_dictionary\n", "from gensim.parsing.preprocessing import strip_punctuation, strip_short, strip_non_alphanum, strip_tags, strip_multiple_whitespaces, remove_stopwords\n", "from gensim.parsing.preprocessing import preprocess_string\n", "from gensim.corpora import Dictionary\n", "from gensim.parsing.preprocessing import STOPWORDS\n", "from gensim.matutils import corpus2dense" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "today = re.sub(string=str(datetime.datetime.today()), pattern=\"\\W\", repl=\"\")" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'20190528085750160508'" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "today" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(14897, 34)" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata = pd.read_json('full.json')\n", "rawDdata.shape" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "rawDdata['Start Date'] = pd.to_datetime(rawDdata['Start Date'],unit='ms')\n", "rawDdata['End Date'] = pd.to_datetime(rawDdata['End Date'],unit='ms')\n", "rawDdata.to_csv(\"fiddlingRaw.csv\")" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(14604, 34)" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata = rawDdata[rawDdata['segCode']==0]\n", "rawDdata.shape" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "rawDdata = rawDdata.sort_index()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Id 39359102\n", "Start Date 2018-02-28 00:00:00\n", "Start Time 16:59:00\n", "End Date 2018-03-06 00:00:00\n", "End Time NULL\n", "Duration NULL\n", "Entered by Jason Coleman\n", "Notes NULL\n", "Question Trouble locating QL49 .J4453\n", "Answer 4:59 25870693270869823292870615 transfer from ...\n", "Notes FULL NULL\n", "Library Dept./Branch/Service Hale Library Help\n", "Where were you? Service Desk\n", "Who answers jmc\n", "Who Asked? KSU undergrad\n", "How many in the group? NULL\n", "Question Format im\n", "Question Type Directional\n", "Referred to? 
NULL\n", "READ 2\n", "Time Spent 6-10 min\n", "Class/Discipline NULL\n", "tags NULL\n", "Room reservation NULL\n", "Reported to: NULL\n", "READ_1_vs_2 1\n", "READ_2_vs_3 0\n", "Transcript 4:59 25870693270869823292870615 transfer from ...\n", "TransLength 1178\n", "segTrans [[4:59, patron, Hi I can't find my book], [4:5...\n", "segCode 0\n", "PatronTextString Hi I can't find my book I can't find the QL I ...\n", "StaffTextString Ok. Which one are you looking for? Are you in ...\n", "AllTextString Hi I can't find my book Ok. Which one are you ...\n", "Name: 13751, dtype: object" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata.loc[13751,]" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "rawDdata = rawDdata.sort_values(by='Start Date')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "rawDdata.head(10)" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "32.71 : Splitting Patron Lines\n" ] } ], "source": [ "def getPatronSections(df,breaks):\n", " st = time.time()\n", " \n", " df = df.copy()\n", " corpus = df['PatronTextString'].str.split(pat=\"\\s{1,}\").copy()\n", " \n", " for i in corpus.index:\n", " \n", " df.loc[i,'First5'] = \" \".join(corpus.loc[i][:breaks[0]])\n", " df.loc[i,'First10'] = \" \".join(corpus.loc[i][:breaks[1]])\n", " df.loc[i,'First20'] = \" \".join(corpus.loc[i][:breaks[2]])\n", " \n", " et = time.time() - st\n", " print('{:.2f} : Splitting Patron Lines'.format(et)) \n", " return(df) \n", "\n", "patronSegmentsOptions = [5,10,20]\n", "rawDdata = getPatronSections(df=rawDdata,breaks=patronSegmentsOptions)" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Id 39359102\n", "Start Date 2018-02-28 00:00:00\n", "Start Time 16:59:00\n", "End Date 2018-03-06 00:00:00\n", "End Time NULL\n", "Duration NULL\n", "Entered by Jason Coleman\n", "Notes NULL\n", "Question Trouble locating QL49 .J4453\n", "Answer 4:59 25870693270869823292870615 transfer from ...\n", "Notes FULL NULL\n", "Library Dept./Branch/Service Hale Library Help\n", "Where were you? Service Desk\n", "Who answers jmc\n", "Who Asked? KSU undergrad\n", "How many in the group? NULL\n", "Question Format im\n", "Question Type Directional\n", "Referred to? NULL\n", "READ 2\n", "Time Spent 6-10 min\n", "Class/Discipline NULL\n", "tags NULL\n", "Room reservation NULL\n", "Reported to: NULL\n", "READ_1_vs_2 1\n", "READ_2_vs_3 0\n", "Transcript 4:59 25870693270869823292870615 transfer from ...\n", "TransLength 1178\n", "segTrans [[4:59, patron, Hi I can't find my book], [4:5...\n", "segCode 0\n", "PatronTextString Hi I can't find my book I can't find the QL I ...\n", "StaffTextString Ok. Which one are you looking for? Are you in ...\n", "AllTextString Hi I can't find my book Ok. 
Which one are you ...\n", "First5 Hi I can't find my\n", "First10 Hi I can't find my book I can't find the\n", "First20 Hi I can't find my book I can't find the QL I ...\n", "Name: 13751, dtype: object" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata.loc[13751,]" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Id int64\n", "Start Date datetime64[ns]\n", "Start Time object\n", "End Date datetime64[ns]\n", "End Time object\n", "Duration object\n", "Entered by object\n", "Notes object\n", "Question object\n", "Answer object\n", "Notes FULL object\n", "Library Dept./Branch/Service object\n", "Where were you? object\n", "Who answers object\n", "Who Asked? object\n", "How many in the group? object\n", "Question Format object\n", "Question Type object\n", "Referred to? object\n", "READ object\n", "Time Spent object\n", "Class/Discipline object\n", "tags object\n", "Room reservation object\n", "Reported to: object\n", "READ_1_vs_2 int64\n", "READ_2_vs_3 int64\n", "Transcript object\n", "TransLength int64\n", "segTrans object\n", "segCode int64\n", "PatronTextString object\n", "StaffTextString object\n", "AllTextString object\n", "First5 object\n", "First10 object\n", "First20 object\n", "dtype: object" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata.dtypes" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Id', 'Start Date', 'Start Time', 'End Date', 'End Time', 'Duration',\n", " 'Entered by', 'Notes', 'Question', 'Answer', 'Notes FULL',\n", " 'Library Dept./Branch/Service', 'Where were you?', 'Who answers',\n", " 'Who Asked?', 'How many in the group?', 'Question Format',\n", " 'Question Type', 'Referred to?', 'READ', 'Time Spent',\n", " 'Class/Discipline', 'tags', 'Room reservation', 'Reported to:',\n", " 'READ_1_vs_2', 'READ_2_vs_3', 'Transcript', 'TransLength', 'segTrans',\n", " 'segCode', 'PatronTextString', 'StaffTextString', 'AllTextString',\n", " 'First5', 'First10', 'First20'],\n", " dtype='object')" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata.columns" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "manualTags = [\n", " ['tagURL',[\n", " re.escape('amazon.com'),\n", " re.escape('newfirstsearch'),\n", " re.escape('galegroup'),\n", " re.escape('ingentaconnect.com'),\n", " re.escape('proquest.com'),\n", " re.escape('ncbi.nlm.nih.gov'),\n", " re.escape('sciencedirect.com'),\n", " re.escape('springer.com'),\n", " re.escape('tandfonline.com'),\n", " re.escape('webofknowledge'),\n", " re.escape('wiley.com'),\n", " re.escape('books.google'),\n", " re.escape('google.com'),\n", "\n", " re.escape('apps.lib.k-state.edu/databases'),\n", "\n", " re.escape('er.lib.ksu.edu'),\n", " re.escape('er.lib.k-state.edu'),\n", "\n", " re.escape('getit.lib.ksu.edu'),\n", " re.escape('getit.lib.k-state.edu'),\n", "\n", " re.escape('guides.lib.ksu.edu'),\n", " re.escape('guides.lib.k-state.edu'),\n", "\n", " re.escape('catalog.lib.ksu.edu'),\n", " re.escape('catalog2.lib.ksu.edu'),\n", " re.escape('catalog.lib.k-state.edu'),\n", " re.escape('catalog2.lib.k-state.edu'),\n", "\n", " re.escape('primo.hosted.exlibrisgroup.com'),\n", " re.escape('na02.alma.exlibrisgroup'),\n", "\n", " re.escape('searchit.lib.ksu.edu'),\n", " re.escape('searchit.lib.k-state.edu'),\n", "\n", " 
re.escape('lib.k-state.edu'),\n", " re.escape('lib.k-state.edu'),\n", "\n", " re.escape('doi.org'),\n", "\n", " re.escape('http'),\n", " re.escape('www.'),]\n", " ],\n", " \n", " ['tagPRINTING',[\n", " 'color print',\n", " 'colored print',\n", " 'print in color',\n", " 'print something in color',\n", " \"\\Win color\\W\",\n", " \"cat cash\",\n", " 'printer',\n", " '(? 0:\n", " listIndex.append(int(i))\n", " else:\n", " notIndex.append(int(i))\n", " \n", " Xlist = representation.loc[listIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist) \n", " \n", " RocAucScore_list = roc_auc_score(y_true=labels.loc[listIndex],y_score=y_prob_list)\n", " df.loc[listIndex,'PredictProbList'] = y_prob_list\n", " df.loc[listIndex,'PredictPredList'] = yprob_pred_list\n", "\n", " \n", " \n", " Xlist = representation.loc[notIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist)\n", " \n", " RocAucScore_NOTlist = roc_auc_score(y_true=labels.loc[notIndex],y_score=y_prob_list)\n", " df.loc[notIndex,'PredictProbNOTList'] = y_prob_list\n", " df.loc[notIndex,'PredictPredNOTList'] = yprob_pred_list\n", " \n", " \n", " \n", " # DEFAULT STATE\n", " else:\n", " RocAucScore_list = RocAucScore\n", " RocAucScore_NOTlist = RocAucScore\n", " df.loc[labels.index,'PredictProbList'] = yprob\n", " df.loc[labels.index,'PredictPredList'] = yprob_pred\n", " \n", " \n", " df.loc[labels.index,'PredictProb'] = yprob\n", " df.loc[labels.index,'PredictPred'] = yprob_pred\n", " \n", " et = time.time() - st\n", " print('{:.2f} : Running MLP Fit and Eval'.format(et))\n", " return(neural_model,RocAucScore,RocAucScore_list,RocAucScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X)" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [], "source": [ "def testModel(df,targetLabels,modelType,dictionary,neural_model,collectTags,ldajoincolumns,tagState):\n", " st = time.time()\n", " \n", " df = df.copy()\n", " \n", " labels = df.loc[df[targetLabels]!=999,targetLabels].astype(int).copy()\n", " df_test = df.loc[labels.index,].copy()\n", " \n", " if modelType == 'lda':\n", " tokenizedTexts = list(df_test['tokenizedTexts'])\n", " tokenizedList_bow = [dictionary.doc2bow(x) for x in tokenizedTexts]\n", " \n", " ldaDocMatrix = [i for i in ldaModel.get_document_topics(tokenizedList_bow)]\n", " representation = pd.DataFrame(corpus2dense(ldaDocMatrix, num_terms=ldaModel.num_topics).transpose(), index=df_test.index)\n", " \n", " \n", " if collectTags:\n", " counter = CountVectorizer(vocabulary=ldajoincolumns,binary=True)\n", " tagstrings = df_test['manualTags'].str.join(\" \")\n", " tagDF = pd.DataFrame(counter.fit_transform(tagstrings).todense(),\n", " columns=ldajoincolumns,\n", " index=df_test.index)\n", " representation = representation.merge(tagDF,left_index=True,right_index=True,suffixes=(False,False))\n", " \n", " \n", " if modelType == 'd2v':\n", " indic = []\n", " dat = []\n", " for i in df_test.index:\n", "# print(i)\n", " indic.append(i)\n", "# dat.append(d2vModel.infer_vector((df_test.loc[i,'tokenizedTexts']+df_test.loc[i,'manualTags']), steps=100))\n", " dat.append(d2vModel.infer_vector((df_test.loc[i,'tokenizedTexts']), steps=100))\n", "\n", " representation = pd.DataFrame(dat,index=indic)\n", " \n", " X = representation\n", " yTest_prob = neural_model.predict_proba(X)[:,1]\n", " yTest_pred = neural_model.predict(X)\n", " RocAucScore = 
roc_auc_score(y_true=labels,y_score=yTest_prob)\n", " \n", " listIndex = []\n", " notIndex = []\n", " if tagState:\n", " for i in df.loc[labels.index,].index:\n", " if len(df.loc[i,'manualTags']) > 0:\n", " listIndex.append(int(i))\n", " else:\n", " notIndex.append(int(i))\n", " \n", " Xlist = representation.loc[listIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist) \n", " \n", " RocAucScore_list = roc_auc_score(y_true=labels.loc[listIndex],y_score=y_prob_list)\n", " df.loc[listIndex,'PredictProbList'] = y_prob_list\n", " df.loc[listIndex,'PredictPredList'] = yprob_pred_list\n", "\n", " \n", " \n", " Xlist = representation.loc[notIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist)\n", " \n", " RocAucScore_NOTlist = roc_auc_score(y_true=labels.loc[notIndex],y_score=y_prob_list)\n", " df.loc[notIndex,'PredictProbNOTList'] = y_prob_list\n", " df.loc[notIndex,'PredictPredNOTList'] = yprob_pred_list\n", " \n", " \n", " \n", " # DEFAULT STATE\n", " else:\n", " RocAucScore_list = RocAucScore\n", " RocAucScore_NOTlist = RocAucScore\n", " df.loc[labels.index,'PredictProbList'] = yTest_prob\n", " df.loc[labels.index,'PredictPredList'] = yTest_pred\n", " \n", " \n", " df.loc[labels.index,'PredictProb'] = yTest_prob\n", " df.loc[labels.index,'PredictPred'] = yTest_pred\n", " \n", " et = time.time() - st\n", " print('{:.2f} : Testing Model with Holdout Data'.format(et))\n", " return(RocAucScore,RocAucScore_list,RocAucScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X)\n", " " ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [], "source": [ "def prepareTestingData(df,vocab,section):\n", " st = time.time()\n", " \n", " df = df.copy() \n", " df['DocTags'] = [[] for i in range(df.shape[0])] \n", " \n", " \n", " splitStrings = df.loc[:,section].str.lower()\n", " splitStrings = splitStrings.str.split(\"\\W\")\n", " \n", " testTokens = pd.Series()\n", " for i in splitStrings.index:\n", " j = []\n", " for m in splitStrings.loc[i]:\n", " if m in vocab:\n", " j.append(m)\n", "\n", " testTokens.loc[i] = j\n", " df.loc[testTokens.index,'tokenizedTexts'] = testTokens\n", " \n", " \n", " et = time.time() - st\n", " print('{:.2f} : Preparing Test Holdout Data'.format(et))\n", " return(df)" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [], "source": [ "def getFocusedROCAUC(df_test,df_train,qtype,labelledsection,tagState):\n", " st = time.time()\n", " df_test = df_test.copy()\n", " df_train = df_train.copy()\n", " \n", " df_train = df_train.loc[df_train['READ']!='Unknown',]\n", " ytrue = df_train.loc[df_train['Question Type']==qtype,labelledSection]\n", " ypred = df_train.loc[df_train['Question Type']==qtype,'PredictProb']\n", " trainRocAucScore = roc_auc_score(y_true=ytrue,y_score=ypred)\n", " \n", " df_test = df_test.loc[df_test['READ']!='Unknown',]\n", " ytrue = df_test.loc[df_test['Question Type']==qtype,labelledSection]\n", " ypred = df_test.loc[df_test['Question Type']==qtype,'PredictProb']\n", " testRocAucScore = roc_auc_score(y_true=ytrue,y_score=ypred)\n", " \n", " if tagState:\n", " trainList = []\n", " for i in df_train.index:\n", " if len(df_train.loc[i,'manualTags']) > 0:\n", " trainList.append(i)\n", " testList = []\n", " for i in df_test.index:\n", " if len(df_test.loc[i,'manualTags']) > 0:\n", " trainList.append(i)\n", " \n", "# trainList = 
list(set(consolidatedList).intersection(set(df_train.index)))\n", "# testList = list(set(consolidatedList).intersection(set(df_test.index)))\n", "\n", " df_train_list = df_train.loc[trainList,].copy()\n", " ytrue_list = df_train_list.loc[df_train_list['Question Type']==qtype,labelledSection]\n", " ypred_list = df_train_list.loc[df_train_list['Question Type']==qtype,'PredictProbList']\n", " try:\n", " trainRocAucScore_list = roc_auc_score(y_true=ytrue_list,y_score=ypred_list)\n", " except:\n", " ypred_list = df_train_list.loc[df_train_list['Question Type']==qtype,'PredictPred']\n", " trainRocAucScore_list = \"Acc_\" + str(accuracy_score(y_true=ytrue_list,y_pred=ypred_list))\n", "\n", "\n", " df_test_list = df_test.loc[testList,].copy()\n", " ytrue_list = df_test_list.loc[df_test_list['Question Type']==qtype,labelledSection]\n", " ypred_list = df_test_list.loc[df_test_list['Question Type']==qtype,'PredictProbList']\n", " try:\n", " testRocAucScore_list = roc_auc_score(y_true=ytrue_list,y_pred=ypred_list)\n", " except:\n", " ypred_list = df_test_list.loc[df_test_list['Question Type']==qtype,'PredictPredList']\n", " testRocAucScore_list = \"Acc_\" + str(accuracy_score(y_true=ytrue_list,y_pred=ypred_list))\n", " trainListShape = len(trainList)\n", " testListShape = len(testList)\n", "\n", " \n", " else:\n", " trainRocAucScore_list = None\n", " testRocAucScore_list = None\n", " trainListShape = 0\n", " testListShape = 0\n", " \n", " \n", " et = time.time() - st\n", " print('{:.2f} : Getting Focused ROC-AUC Scores ({})'.format(et,qtype))\n", " return(trainRocAucScore,testRocAucScore,trainRocAucScore_list,testRocAucScore_list,trainListShape,testListShape)" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [], "source": [ "# custom vocab builder\n", "customVocab = []\n", "\n", "# random states\n", "randomOptions = list(np.arange(0,20,1))\n", "# randomOptions = [0]\n", "\n", "#test Split\n", "\n", "testingSplits = [\n", " 2000,\n", "]\n", "\n", "\n", "manualTagOptions = [\n", " True,\n", " False,\n", "]\n", "rollupsOptions = [\n", " True,\n", " False,\n", "]\n", "\n", "\n", "#tfidf limits\n", "dictOptions = [ \n", " (False, 1, 300000),\n", " (True, 2, 3000),\n", "]\n", "\n", "#which patron section to look at\n", "patronSectionOptions = [\n", " 'First10',\n", " 'First20',\n", "]\n", "\n", "#labels\n", "labelledSectionOptions = [\n", " 'READ_1_vs_2',\n", " 'READ_2_vs_3',\n", "]\n", "\n", "#model\n", "modelOptions = [\n", " 'lda',\n", " 'd2v',\n", "]\n", "\n", "\n", "optionsList = [\n", " randomOptions,\n", " manualTagOptions,\n", " rollupsOptions,\n", " dictOptions,\n", " patronSectionOptions,\n", " labelledSectionOptions,\n", " modelOptions,\n", "]\n", "\n", "optionsLen = [len(x) for x in optionsList]\n", "\n", "testtotal = np.prod(optionsLen)\n", "iterationcounter = 0\n", "\n", "parameterList = []" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "for z in tqdm.tqdm(randomOptions):\n", " randomSeed=z\n", "\n", "\n", " for s in testingSplits:\n", " split = s\n", " testsplit = rawDdata.shape[0]-split\n", " trainData = rawDdata[:testsplit]\n", " testData = rawDdata[testsplit:]\n", " \n", " trainShape = trainData.shape[0]\n", " testShape = testData.shape[0]\n", "\n", " for e in patronSectionOptions:\n", " patronSection=e \n", "\n", " for f in manualTagOptions:\n", " getTags = f\n", "\n", " for r in rollupsOptions:\n", " if getTags == True:\n", " rollup = r\n", " elif getTags == False:\n", " if r == True:\n", " 
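# with manual tags disabled the rollup flag has no effect, so skip the duplicate combination\n", " 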
continue\n", " else:\n", " rollup = r\n", " trainDataTagged = getManualTags(df=trainData,\n", " collectTags=getTags,\n", " manualTagsList=manualTags,\n", " section=patronSection,\n", " rollups=rollup,\n", " )\n", " testDataTagged = getManualTags(df=testData,\n", " collectTags=getTags,\n", " manualTagsList=manualTags,\n", " section=patronSection,\n", " rollups=rollup,\n", " )\n", "\n", " for g in labelledSectionOptions:\n", " labelledSection=g\n", "\n", " for h in modelOptions:\n", " model=h \n", "\n", " for m in dictOptions:\n", " truncate=m \n", "\n", " start_time = time.time()\n", "\n", " trainDataTRUNC,countDF,countList,tfidfList,remove,vocab,gendict = getTFIDFlimited(\n", " df=trainDataTagged,\n", " truncate=truncate,\n", " section=patronSection,\n", " controlVocab=customVocab,\n", " model=model,\n", " )\n", " \n", "\n", " if model == 'lda':\n", " ldaModel, representation, ldacolumntags = getLDAmatrix(\n", " df=trainDataTRUNC,\n", " dictionary=gendict,\n", " collectTags=getTags,\n", " random=randomSeed)\n", "\n", " if model == 'd2v':\n", " d2vModel, representation = doc2vecModel(\n", " df=trainDataTRUNC,\n", " random=randomSeed) \n", " ldacolumntags=None\n", "\n", "\n", "\n", " MLPmodel,AUCscore,AUCscorelist,AUCscoreNOTlist,trainLlen,trainNLlen,trainDataFinal,trainFinalRep = trainModel(\n", " df=trainDataTRUNC,\n", " reps=representation,\n", " targetLabels=labelledSection,\n", " random=randomSeed,\n", " tagState=getTags,)\n", "\n", "\n", " testDataPrepped = prepareTestingData(\n", " df=testDataTagged,\n", " vocab=vocab,\n", " section=patronSection,)\n", "\n", "\n", " testAUC,testAUClist,testAUCNOTlist,testLlen,testNLlen,testDataFinal,testFinalRep = testModel(\n", " df=testDataPrepped,\n", " targetLabels=labelledSection,\n", " modelType=model,\n", " dictionary=gendict,\n", " neural_model=MLPmodel,\n", " ldajoincolumns=ldacolumntags,\n", " collectTags=getTags,\n", " tagState=getTags,)\n", " \n", "\n", "\n", " end_time = time.time()\n", " total_time = end_time-start_time\n", "\n", " modelParameters = {\n", " \n", " 'TRUNC':patronSection,\n", " 'O-Core':getTags,\n", " 'O-Core+Super':rollup,\n", " 'DICT':str(truncate),\n", " 'REPRESENT':model,\n", " 'READ':labelledSection,\n", " 'AUC_train':AUCscore,\n", " 'AUC_test':testAUC,\n", "\n", " 'AUC_train_Onto':AUCscorelist,\n", " 'AUC_test_Onto':testAUClist,\n", " 'AUC_train_Not_Onto':AUCscoreNOTlist,\n", " 'AUC_test_Not_Onto':testAUCNOTlist,\n", "\n", " 'AUC_train_Onto_LEN':trainLlen,\n", " 'AUC_train_OntoN_LEN':trainNLlen,\n", " 'AUC_test_Onto_LEN':testLlen,\n", " 'AUC_test_OntoN_LEN':testNLlen,\n", " \n", " \n", " 'CycleTime':total_time,\n", " 'RAND':randomSeed,\n", " 'Custom Vocab Len':len(customVocab),\n", " 'Full Vocab Len':len(vocab),\n", " 'Testing Split':testsplit,\n", " 'Training Data Full':trainShape,\n", " 'Testing Data Full':testShape,\n", " 'Training Data Final':trainDataFinal.shape[0],\n", " 'Testing Data Final':testDataFinal.shape[0],\n", " }\n", "\n", " parameterList.append(list(modelParameters.values()))\n", " print(modelParameters.values())\n", " iterationcounter += 1\n", " print(\"{} out of {} complete\".format(iterationcounter,testtotal))\n", "\n", "parameterDataFrame = pd.DataFrame(parameterList,columns=list(modelParameters.keys()))" ] }, { "cell_type": "code", "execution_count": 131, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TRUNCO-CoreO-Core+SuperDICTREPRESENTREADAUC_trainAUC_testAUC_train_OntoAUC_test_Onto...AUC_test_OntoN_LENCycleTimeRANDCustom Vocab LenFull Vocab LenTesting SplitTraining Data FullTesting Data FullTraining Data FinalTesting Data Final
0First10TrueTrue(False, 1, 300000)ldaREAD_1_vs_20.7126270.6587890.8169800.729856...141336.25290500742212604126042000101621753
1First10TrueTrue(True, 2, 3000)ldaREAD_1_vs_20.7062740.6627810.8138950.731202...141328.37601700296712604126042000101621753
2First10TrueTrue(False, 1, 300000)d2vREAD_1_vs_20.7260600.6811490.7986950.726875...141331.99005700742212604126042000101621753
3First10TrueTrue(True, 2, 3000)d2vREAD_1_vs_20.7099610.6705470.7917960.730337...141325.85572000296712604126042000101621753
4First10TrueTrue(False, 1, 300000)ldaREAD_2_vs_30.7032180.6442130.8242650.768131...141335.52799300742212604126042000101621753
\n", "

5 rows × 25 columns

\n", "
" ], "text/plain": [ " TRUNC O-Core O-Core+Super DICT REPRESENT READ \\\n", "0 First10 True True (False, 1, 300000) lda READ_1_vs_2 \n", "1 First10 True True (True, 2, 3000) lda READ_1_vs_2 \n", "2 First10 True True (False, 1, 300000) d2v READ_1_vs_2 \n", "3 First10 True True (True, 2, 3000) d2v READ_1_vs_2 \n", "4 First10 True True (False, 1, 300000) lda READ_2_vs_3 \n", "\n", " AUC_train AUC_test AUC_train_Onto AUC_test_Onto ... \\\n", "0 0.712627 0.658789 0.816980 0.729856 ... \n", "1 0.706274 0.662781 0.813895 0.731202 ... \n", "2 0.726060 0.681149 0.798695 0.726875 ... \n", "3 0.709961 0.670547 0.791796 0.730337 ... \n", "4 0.703218 0.644213 0.824265 0.768131 ... \n", "\n", " AUC_test_OntoN_LEN CycleTime RAND Custom Vocab Len Full Vocab Len \\\n", "0 1413 36.252905 0 0 7422 \n", "1 1413 28.376017 0 0 2967 \n", "2 1413 31.990057 0 0 7422 \n", "3 1413 25.855720 0 0 2967 \n", "4 1413 35.527993 0 0 7422 \n", "\n", " Testing Split Training Data Full Testing Data Full Training Data Final \\\n", "0 12604 12604 2000 10162 \n", "1 12604 12604 2000 10162 \n", "2 12604 12604 2000 10162 \n", "3 12604 12604 2000 10162 \n", "4 12604 12604 2000 10162 \n", "\n", " Testing Data Final \n", "0 1753 \n", "1 1753 \n", "2 1753 \n", "3 1753 \n", "4 1753 \n", "\n", "[5 rows x 25 columns]" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parameterDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parameterDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TRUNCO-CoreO-Core+SuperDICTREPRESENTREADAUC_trainAUC_testAUC_train_OntoAUC_test_Onto...AUC_test_OntoN_LENCycleTimeRANDCustom Vocab LenFull Vocab LenTesting SplitTraining Data FullTesting Data FullTraining Data FinalTesting Data Final
0First10TrueTrue(False, 1, 300000)ldaREAD_1_vs_20.7126270.6587890.8169800.729856...141336.25290500742212604126042000101621753
1First10TrueTrue(True, 2, 3000)ldaREAD_1_vs_20.7062740.6627810.8138950.731202...141328.37601700296712604126042000101621753
2First10TrueTrue(False, 1, 300000)d2vREAD_1_vs_20.7260600.6811490.7986950.726875...141331.99005700742212604126042000101621753
3First10TrueTrue(True, 2, 3000)d2vREAD_1_vs_20.7099610.6705470.7917960.730337...141325.85572000296712604126042000101621753
4First10TrueTrue(False, 1, 300000)ldaREAD_2_vs_30.7032180.6442130.8242650.768131...141335.52799300742212604126042000101621753
\n", "

5 rows × 25 columns

\n", "
" ], "text/plain": [ " TRUNC O-Core O-Core+Super DICT REPRESENT READ \\\n", "0 First10 True True (False, 1, 300000) lda READ_1_vs_2 \n", "1 First10 True True (True, 2, 3000) lda READ_1_vs_2 \n", "2 First10 True True (False, 1, 300000) d2v READ_1_vs_2 \n", "3 First10 True True (True, 2, 3000) d2v READ_1_vs_2 \n", "4 First10 True True (False, 1, 300000) lda READ_2_vs_3 \n", "\n", " AUC_train AUC_test AUC_train_Onto AUC_test_Onto ... \\\n", "0 0.712627 0.658789 0.816980 0.729856 ... \n", "1 0.706274 0.662781 0.813895 0.731202 ... \n", "2 0.726060 0.681149 0.798695 0.726875 ... \n", "3 0.709961 0.670547 0.791796 0.730337 ... \n", "4 0.703218 0.644213 0.824265 0.768131 ... \n", "\n", " AUC_test_OntoN_LEN CycleTime RAND Custom Vocab Len Full Vocab Len \\\n", "0 1413 36.252905 0 0 7422 \n", "1 1413 28.376017 0 0 2967 \n", "2 1413 31.990057 0 0 7422 \n", "3 1413 25.855720 0 0 2967 \n", "4 1413 35.527993 0 0 7422 \n", "\n", " Testing Split Training Data Full Testing Data Full Training Data Final \\\n", "0 12604 12604 2000 10162 \n", "1 12604 12604 2000 10162 \n", "2 12604 12604 2000 10162 \n", "3 12604 12604 2000 10162 \n", "4 12604 12604 2000 10162 \n", "\n", " Testing Data Final \n", "0 1753 \n", "1 1753 \n", "2 1753 \n", "3 1753 \n", "4 1753 \n", "\n", "[5 rows x 25 columns]" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parameterDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AUC_train_Not_OntoAUC_test_Not_OntoAUC_train_Onto_LENAUC_train_OntoN_LENAUC_test_Onto_LENAUC_test_OntoN_LENCycleTimeRANDCustom Vocab LenFull Vocab Len
00.6807200.63751018328330340141336.252905007422
10.6731450.64425318328330340141328.376017002967
20.7040600.66598118328330340141331.990057007422
30.6847040.65206918328330340141325.855720002967
40.6705930.60220618328330340141335.527993007422
\n", "
" ], "text/plain": [ " AUC_train_Not_Onto AUC_test_Not_Onto AUC_train_Onto_LEN \\\n", "0 0.680720 0.637510 1832 \n", "1 0.673145 0.644253 1832 \n", "2 0.704060 0.665981 1832 \n", "3 0.684704 0.652069 1832 \n", "4 0.670593 0.602206 1832 \n", "\n", " AUC_train_OntoN_LEN AUC_test_Onto_LEN AUC_test_OntoN_LEN CycleTime \\\n", "0 8330 340 1413 36.252905 \n", "1 8330 340 1413 28.376017 \n", "2 8330 340 1413 31.990057 \n", "3 8330 340 1413 25.855720 \n", "4 8330 340 1413 35.527993 \n", "\n", " RAND Custom Vocab Len Full Vocab Len \n", "0 0 0 7422 \n", "1 0 0 2967 \n", "2 0 0 7422 \n", "3 0 0 2967 \n", "4 0 0 7422 " ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parameterDataFrame.iloc[:,10:20].head()" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [], "source": [ "parameterDataFrame.to_csv('{}_Run.csv'.format(today).format(today))" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [], "source": [ "preservedDataFrame = pd.read_csv('{}_Run.csv'.format(today),index_col=0)\n", "preservedDataFrame.to_csv('{}_preserveRun.csv'.format(today))" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [], "source": [ "preservedDataFrame = pd.read_csv('{}_preserveRun.csv'.format(today),index_col=0)" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(96, 25)" ] }, "execution_count": 140, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ " " ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [], "source": [ "preservedDataFrame['O-Core'] = np.where(preservedDataFrame['O-Core'], 1, 0)\n", "preservedDataFrame['O-Core+Super'] = np.where(preservedDataFrame['O-Core+Super'], 1, 0)\n", "preservedDataFrame['TRUNC'] = np.where(preservedDataFrame['TRUNC'] == 'First20', 2,\n", " np.where(preservedDataFrame['TRUNC'] == 'First10', 1, 0))\n", "preservedDataFrame['DICT'] = np.where(preservedDataFrame['DICT'] == \"\"\"(True, 2, 3000)\"\"\", 1, 0)\n", "\n", "preservedDataFrame['READ'] = np.where(preservedDataFrame['READ'] == 'READ_1_vs_2', 0, 1)\n", "preservedDataFrame['REPRESENT'] = np.where(preservedDataFrame['REPRESENT'] == 'lda', 0, 1)" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 
'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 143, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", " 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,\n", " 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,\n", " 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,\n", " 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,\n", " 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95],\n", " dtype='int64')" ] }, "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.index" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TRUNCO-CoreO-Core+SuperDICTREPRESENTREADAUC_trainAUC_testAUC_train_OntoAUC_test_Onto...AUC_test_OntoN_LENCycleTimeRANDCustom Vocab LenFull Vocab LenTesting SplitTraining Data FullTesting Data FullTraining Data FinalTesting Data Final
01110000.7126270.6587890.8169800.729856...141336.25290500742212604126042000101621753
11111000.7062740.6627810.8138950.731202...141328.37601700296712604126042000101621753
21110100.7260600.6811490.7986950.726875...141331.99005700742212604126042000101621753
31111100.7099610.6705470.7917960.730337...141325.85572000296712604126042000101621753
41110010.7032180.6442130.8242650.768131...141335.52799300742212604126042000101621753
51111010.6971650.6326390.8226270.739773...141328.37800000296712604126042000101621753
61110110.7238240.6495190.7821960.745600...141330.56500200742212604126042000101621753
71111110.7040800.6478820.7705380.716672...141325.69000000296712604126042000101621753
81100000.7042540.6580590.8129780.716202...141336.21899600742212604126042000101621753
91101000.7010960.6506160.8138770.722212...141328.41700100296712604126042000101621753
\n", "

10 rows × 25 columns

\n", "
" ], "text/plain": [ " TRUNC O-Core O-Core+Super DICT REPRESENT READ AUC_train AUC_test \\\n", "0 1 1 1 0 0 0 0.712627 0.658789 \n", "1 1 1 1 1 0 0 0.706274 0.662781 \n", "2 1 1 1 0 1 0 0.726060 0.681149 \n", "3 1 1 1 1 1 0 0.709961 0.670547 \n", "4 1 1 1 0 0 1 0.703218 0.644213 \n", "5 1 1 1 1 0 1 0.697165 0.632639 \n", "6 1 1 1 0 1 1 0.723824 0.649519 \n", "7 1 1 1 1 1 1 0.704080 0.647882 \n", "8 1 1 0 0 0 0 0.704254 0.658059 \n", "9 1 1 0 1 0 0 0.701096 0.650616 \n", "\n", " AUC_train_Onto AUC_test_Onto ... AUC_test_OntoN_LEN \\\n", "0 0.816980 0.729856 ... 1413 \n", "1 0.813895 0.731202 ... 1413 \n", "2 0.798695 0.726875 ... 1413 \n", "3 0.791796 0.730337 ... 1413 \n", "4 0.824265 0.768131 ... 1413 \n", "5 0.822627 0.739773 ... 1413 \n", "6 0.782196 0.745600 ... 1413 \n", "7 0.770538 0.716672 ... 1413 \n", "8 0.812978 0.716202 ... 1413 \n", "9 0.813877 0.722212 ... 1413 \n", "\n", " CycleTime RAND Custom Vocab Len Full Vocab Len Testing Split \\\n", "0 36.252905 0 0 7422 12604 \n", "1 28.376017 0 0 2967 12604 \n", "2 31.990057 0 0 7422 12604 \n", "3 25.855720 0 0 2967 12604 \n", "4 35.527993 0 0 7422 12604 \n", "5 28.378000 0 0 2967 12604 \n", "6 30.565002 0 0 7422 12604 \n", "7 25.690000 0 0 2967 12604 \n", "8 36.218996 0 0 7422 12604 \n", "9 28.417001 0 0 2967 12604 \n", "\n", " Training Data Full Testing Data Full Training Data Final \\\n", "0 12604 2000 10162 \n", "1 12604 2000 10162 \n", "2 12604 2000 10162 \n", "3 12604 2000 10162 \n", "4 12604 2000 10162 \n", "5 12604 2000 10162 \n", "6 12604 2000 10162 \n", "7 12604 2000 10162 \n", "8 12604 2000 10162 \n", "9 12604 2000 10162 \n", "\n", " Testing Data Final \n", "0 1753 \n", "1 1753 \n", "2 1753 \n", "3 1753 \n", "4 1753 \n", "5 1753 \n", "6 1753 \n", "7 1753 \n", "8 1753 \n", "9 1753 \n", "\n", "[10 rows x 25 columns]" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.head(10)" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(96, 25)" ] }, "execution_count": 146, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [], "source": [ "deltaColumns = [(\"D_\"+x) for x in preservedDataFrame.iloc[:,6:].columns]" ] }, { "cell_type": "code", "execution_count": 149, "metadata": { "scrolled": true }, "outputs": [], "source": [ "combos = list(itertools.combinations(list(preservedDataFrame.index),2))\n", "headers = ['TRUNC','O-Core','O-Core+Super','READ', 'REPRESENT','DICT','RAND']\n", "\n", "combolist = [list(x) for x in combos]\n", "\n", "pairwiseDataFrame = pd.DataFrame(columns=deltaColumns, index=range(len(combolist)))\n", "\n", "m1 = []\n", "m2 = []\n", "\n", "for i in combolist:\n", " 
m1.append(i[0])\n", " m2.append(i[1])\n", " \n", "pairwiseDataFrame['M1'] = m1\n", "pairwiseDataFrame['M2'] = m2" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4560, 21)" ] }, "execution_count": 150, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRUNC\n", "O-Core\n", "O-Core+Super\n", "READ\n", "REPRESENT\n", "DICT\n", "RAND\n" ] } ], "source": [ "for i in headers:\n", " print(i)" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['D_AUC_train', 'D_AUC_test', 'D_AUC_train_Onto', 'D_AUC_test_Onto',\n", " 'D_AUC_train_Not_Onto', 'D_AUC_test_Not_Onto', 'D_AUC_train_Onto_LEN',\n", " 'D_AUC_train_OntoN_LEN', 'D_AUC_test_Onto_LEN', 'D_AUC_test_OntoN_LEN',\n", " 'D_CycleTime', 'D_RAND', 'D_Custom Vocab Len', 'D_Full Vocab Len',\n", " 'D_Testing Split', 'D_Training Data Full', 'D_Testing Data Full',\n", " 'D_Training Data Final', 'D_Testing Data Final', 'M1', 'M2'],\n", " dtype='object')" ] }, "execution_count": 152, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
D_AUC_trainD_AUC_testD_AUC_train_OntoD_AUC_test_OntoD_AUC_train_Not_OntoD_AUC_test_Not_OntoD_AUC_train_Onto_LEND_AUC_train_OntoN_LEND_AUC_test_Onto_LEND_AUC_test_OntoN_LEN...D_RANDD_Custom Vocab LenD_Full Vocab LenD_Testing SplitD_Training Data FullD_Testing Data FullD_Training Data FinalD_Testing Data FinalM1M2
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN01
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN02
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN03
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN04
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN05
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN ... D_RAND \\\n", "0 NaN NaN NaN ... NaN \n", "1 NaN NaN NaN ... NaN \n", "2 NaN NaN NaN ... NaN \n", "3 NaN NaN NaN ... NaN \n", "4 NaN NaN NaN ... NaN \n", "\n", " D_Custom Vocab Len D_Full Vocab Len D_Testing Split D_Training Data Full \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_Testing Data Full D_Training Data Final D_Testing Data Final M1 M2 \n", "0 NaN NaN NaN 0 1 \n", "1 NaN NaN NaN 0 2 \n", "2 NaN NaN NaN 0 3 \n", "3 NaN NaN NaN 0 4 \n", "4 NaN NaN NaN 0 5 \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4560, 21)" ] }, "execution_count": 154, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [], "source": [ "breakpoints = np.arange(0,1900000,1000)" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 1000, 2000, ..., 1897000, 1898000, 1899000])" ] }, "execution_count": 156, "metadata": {}, "output_type": "execute_result" } ], "source": [ "breakpoints" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "for k in range(len(breakpoints)-1):\n", " print(\"BREAKPOINT {}-{}\".format(breakpoints[k],breakpoints[k+1]))\n", " \n", " start = breakpoints[k]\n", " end = breakpoints[k+1] \n", " \n", " fseries = pd.Series()\n", "\n", " for i in tqdm.tqdm(pairwiseDataFrame.index[start:end]):\n", " m1 = pairwiseDataFrame.loc[i,'M1']\n", " m2 = pairwiseDataFrame.loc[i,'M2']\n", "\n", " factors = str()\n", " counter = 0\n", "\n", " if preservedDataFrame.loc[m1,headers[0]] != preservedDataFrame.loc[m2,headers[0]]:\n", " factors = factors + headers[0] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[1]] != preservedDataFrame.loc[m2,headers[1]]:\n", " factors = factors + headers[1] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[2]] != preservedDataFrame.loc[m2,headers[2]]:\n", " factors = factors + headers[2] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[3]] != preservedDataFrame.loc[m2,headers[3]]:\n", " factors = factors + headers[3] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[4]] != preservedDataFrame.loc[m2,headers[4]]:\n", " factors = factors + headers[4] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[5]] != preservedDataFrame.loc[m2,headers[5]]:\n", " factors = factors + headers[5] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[6]] != preservedDataFrame.loc[m2,headers[6]]:\n", " factors = factors + headers[6] + \" \"\n", " counter += 1 \n", " \n", " fseries.loc[i] = factors\n", "\n", "\n", " 
pairwiseDataFrame.loc[fseries.index,'Factor'] = fseries\n" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
D_AUC_trainD_AUC_testD_AUC_train_OntoD_AUC_test_OntoD_AUC_train_Not_OntoD_AUC_test_Not_OntoD_AUC_train_Onto_LEND_AUC_train_OntoN_LEND_AUC_test_Onto_LEND_AUC_test_OntoN_LEN...D_Custom Vocab LenD_Full Vocab LenD_Testing SplitD_Training Data FullD_Testing Data FullD_Training Data FinalD_Testing Data FinalM1M2Factor
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN01DICT
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN02REPRESENT
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN03REPRESENT DICT
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN04READ
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN05READ DICT
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " ... D_Custom Vocab Len D_Full Vocab Len D_Testing Split \\\n", "0 ... NaN NaN NaN \n", "1 ... NaN NaN NaN \n", "2 ... NaN NaN NaN \n", "3 ... NaN NaN NaN \n", "4 ... NaN NaN NaN \n", "\n", " D_Training Data Full D_Testing Data Full D_Training Data Final \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_Testing Data Final M1 M2 Factor \n", "0 NaN 0 1 DICT \n", "1 NaN 0 2 REPRESENT \n", "2 NaN 0 3 REPRESENT DICT \n", "3 NaN 0 4 READ \n", "4 NaN 0 5 READ DICT \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 160, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "0 DICT \n", "1 REPRESENT \n", "2 REPRESENT DICT \n", "3 READ \n", "4 READ DICT \n", "5 READ REPRESENT \n", "6 READ REPRESENT DICT \n", "7 O-Core+Super \n", "8 O-Core+Super DICT \n", "9 O-Core+Super REPRESENT \n", "10 O-Core+Super REPRESENT DICT \n", "11 O-Core+Super READ \n", "12 O-Core+Super READ DICT \n", "13 O-Core+Super READ REPRESENT \n", "14 O-Core+Super READ REPRESENT DICT \n", "15 O-Core O-Core+Super \n", "16 O-Core O-Core+Super DICT \n", "17 O-Core O-Core+Super REPRESENT \n", "18 O-Core O-Core+Super REPRESENT DICT \n", "19 O-Core O-Core+Super READ \n", "20 O-Core O-Core+Super READ DICT \n", "21 O-Core O-Core+Super READ REPRESENT \n", "22 O-Core O-Core+Super READ REPRESENT DICT \n", "23 TRUNC \n", "24 TRUNC DICT \n", "25 TRUNC REPRESENT \n", "26 TRUNC REPRESENT DICT \n", "27 TRUNC READ \n", "28 TRUNC READ DICT \n", "29 TRUNC READ REPRESENT \n", " ... \n", "4530 O-Core DICT \n", "4531 O-Core \n", "4532 DICT \n", "4533 REPRESENT \n", "4534 REPRESENT DICT \n", "4535 READ \n", "4536 READ DICT \n", "4537 READ REPRESENT \n", "4538 READ REPRESENT DICT \n", "4539 REPRESENT DICT \n", "4540 REPRESENT \n", "4541 READ DICT \n", "4542 READ \n", "4543 READ REPRESENT DICT \n", "4544 READ REPRESENT \n", "4545 DICT \n", "4546 READ REPRESENT \n", "4547 READ REPRESENT DICT \n", "4548 READ \n", "4549 READ DICT \n", "4550 READ REPRESENT DICT \n", "4551 READ REPRESENT \n", "4552 READ DICT \n", "4553 READ \n", "4554 DICT \n", "4555 REPRESENT \n", "4556 REPRESENT DICT \n", "4557 REPRESENT DICT \n", "4558 REPRESENT \n", "4559 DICT \n", "Name: Factor, Length: 4560, dtype: object" ] }, "execution_count": 160, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame['Factor']" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame.to_csv('{}_pairwise.csv'.format(today))" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame = pd.read_csv('{}_pairwise.csv'.format(today),index_col=0)" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
D_AUC_trainD_AUC_testD_AUC_train_OntoD_AUC_test_OntoD_AUC_train_Not_OntoD_AUC_test_Not_OntoD_AUC_train_Onto_LEND_AUC_train_OntoN_LEND_AUC_test_Onto_LEND_AUC_test_OntoN_LEN...D_Custom Vocab LenD_Full Vocab LenD_Testing SplitD_Training Data FullD_Testing Data FullD_Training Data FinalD_Testing Data FinalM1M2Factor
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN01DICT
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN02REPRESENT
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN03REPRESENT DICT
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN04READ
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN05READ DICT
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " ... D_Custom Vocab Len D_Full Vocab Len D_Testing Split \\\n", "0 ... NaN NaN NaN \n", "1 ... NaN NaN NaN \n", "2 ... NaN NaN NaN \n", "3 ... NaN NaN NaN \n", "4 ... NaN NaN NaN \n", "\n", " D_Training Data Full D_Testing Data Full D_Training Data Final \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_Testing Data Final M1 M2 Factor \n", "0 NaN 0 1 DICT \n", "1 NaN 0 2 REPRESENT \n", "2 NaN 0 3 REPRESENT DICT \n", "3 NaN 0 4 READ \n", "4 NaN 0 5 READ DICT \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 163, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "splits = pairwiseDataFrame['Factor'].str.split()" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame['FactorLength'] = [len(x) for x in splits]" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " ... D_Full Vocab Len D_Testing Split D_Training Data Full \\\n", "0 ... NaN NaN NaN \n", "1 ... NaN NaN NaN \n", "2 ... NaN NaN NaN \n", "3 ... NaN NaN NaN \n", "4 ... NaN NaN NaN \n", "\n", " D_Testing Data Full D_Training Data Final D_Testing Data Final M1 M2 \\\n", "0 NaN NaN NaN 0 1 \n", "1 NaN NaN NaN 0 2 \n", "2 NaN NaN NaN 0 3 \n", "3 NaN NaN NaN 0 4 \n", "4 NaN NaN NaN 0 5 \n", "\n", " Factor FactorLength \n", "0 DICT 1 \n", "1 REPRESENT 1 \n", "2 REPRESENT DICT 2 \n", "3 READ 1 \n", "4 READ DICT 2 \n", "\n", "[5 rows x 23 columns]" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4560, 23)" ] }, "execution_count": 167, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame['Neighbor'] = np.where(pairwiseDataFrame['FactorLength']==1,1,0)" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN ... \\\n", "0 NaN NaN NaN ... \n", "1 NaN NaN NaN ... \n", "2 NaN NaN NaN ... \n", "3 NaN NaN NaN ... \n", "4 NaN NaN NaN ... \n", "\n", " D_Testing Split D_Training Data Full D_Testing Data Full \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_Training Data Final D_Testing Data Final M1 M2 Factor \\\n", "0 NaN NaN 0 1 DICT \n", "1 NaN NaN 0 2 REPRESENT \n", "2 NaN NaN 0 3 REPRESENT DICT \n", "3 NaN NaN 0 4 READ \n", "4 NaN NaN 0 5 READ DICT \n", "\n", " FactorLength Neighbor \n", "0 1 1 \n", "1 1 1 \n", "2 2 0 \n", "3 1 1 \n", "4 2 0 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 169, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4560, 24)" ] }, "execution_count": 170, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame.to_csv('{}_pairwise.csv'.format(today))" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame = pd.read_csv('{}_pairwise.csv'.format(today),index_col=0)" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [], "source": [ "filteredPairWise = pairwiseDataFrame.loc[pairwiseDataFrame['Neighbor']==1,].copy()" ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(304, 24)" ] }, "execution_count": 174, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.shape" ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [], "source": [ "filteredPairWise['Factor'] = filteredPairWise['Factor'].str.strip()" ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "7 NaN NaN NaN NaN \n", "23 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "3 NaN NaN NaN \n", "7 NaN NaN NaN \n", "23 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "3 NaN NaN NaN \n", "7 NaN NaN NaN \n", "23 NaN NaN NaN \n", "\n", " ... D_Testing Split D_Training Data Full D_Testing Data Full \\\n", "0 ... NaN NaN NaN \n", "1 ... NaN NaN NaN \n", "3 ... NaN NaN NaN \n", "7 ... NaN NaN NaN \n", "23 ... NaN NaN NaN \n", "\n", " D_Training Data Final D_Testing Data Final M1 M2 Factor \\\n", "0 NaN NaN 0 1 DICT \n", "1 NaN NaN 0 2 REPRESENT \n", "3 NaN NaN 0 4 READ \n", "7 NaN NaN 0 8 O-Core+Super \n", "23 NaN NaN 0 24 TRUNC \n", "\n", " FactorLength Neighbor \n", "0 1 1 \n", "1 1 1 \n", "3 1 1 \n", "7 1 1 \n", "23 1 1 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 176, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.head()" ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(304, 24)" ] }, "execution_count": 177, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.shape" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['D_AUC_train', 'D_AUC_test', 'D_AUC_train_Onto', 'D_AUC_test_Onto',\n", " 'D_AUC_train_Not_Onto', 'D_AUC_test_Not_Onto', 'D_AUC_train_Onto_LEN',\n", " 'D_AUC_train_OntoN_LEN', 'D_AUC_test_Onto_LEN', 'D_AUC_test_OntoN_LEN',\n", " 'D_CycleTime', 'D_RAND', 'D_Custom Vocab Len', 'D_Full Vocab Len',\n", " 'D_Testing Split', 'D_Training Data Full', 'D_Testing Data Full',\n", " 'D_Training Data Final', 'D_Testing Data Final', 'M1', 'M2', 'Factor',\n", " 'FactorLength', 'Neighbor'],\n", " dtype='object')" ] }, "execution_count": 178, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.columns" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 179, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 180, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 304/304 [00:02<00:00, 123.83it/s]\n" ] } ], "source": [ "for i in tqdm.tqdm(filteredPairWise.index):\n", " for k in filteredPairWise.columns[:-5]:\n", " m1 = filteredPairWise.loc[i,'M1']\n", " m2 = filteredPairWise.loc[i,'M2']\n", " factor = filteredPairWise.loc[i,'Factor']\n", " try:\n", " if preservedDataFrame.loc[m1,factor] > preservedDataFrame.loc[m2,factor]:\n", " filteredPairWise.loc[i,k] = preservedDataFrame.loc[m1,k[2:]] - 
preservedDataFrame.loc[m2,k[2:]]\n", " else:\n", " filteredPairWise.loc[i,k] = preservedDataFrame.loc[m2,k[2:]] - preservedDataFrame.loc[m1,k[2:]]\n", " except:\n", " continue\n", "\n" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 -0.006353 0.003992 -0.003085 0.001346 \n", "1 0.013433 0.022359 -0.018285 -0.002981 \n", "3 -0.009409 -0.014576 0.007286 0.038275 \n", "7 0.008373 0.000731 0.004001 0.013654 \n", "23 0.011228 0.033588 -0.017959 0.003631 \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 -0.007575 0.006743 0.0 \n", "1 0.023340 0.028472 0.0 \n", "3 -0.010127 -0.035304 0.0 \n", "7 0.010689 -0.002168 0.0 \n", "23 -0.000128 0.029411 1201.0 \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "7 0.0 0.0 0.0 \n", "23 -1201.0 209.0 -209.0 \n", "\n", " ... D_Testing Split D_Training Data Full D_Testing Data Full \\\n", "0 ... 0.0 0.0 0.0 \n", "1 ... 0.0 0.0 0.0 \n", "3 ... 0.0 0.0 0.0 \n", "7 ... 0.0 0.0 0.0 \n", "23 ... 0.0 0.0 0.0 \n", "\n", " D_Training Data Final D_Testing Data Final M1 M2 Factor \\\n", "0 0.0 0.0 0 1 DICT \n", "1 0.0 0.0 0 2 REPRESENT \n", "3 0.0 0.0 0 4 READ \n", "7 0.0 0.0 0 8 O-Core+Super \n", "23 0.0 0.0 0 24 TRUNC \n", "\n", " FactorLength Neighbor \n", "0 1 1 \n", "1 1 1 \n", "3 1 1 \n", "7 1 1 \n", "23 1 1 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 181, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.head()" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(304, 24)" ] }, "execution_count": 182, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.shape" ] }, { "cell_type": "code", "execution_count": 183, "metadata": {}, "outputs": [], "source": [ "filteredPairWise.to_csv('{}_filteredPairWise.csv'.format(today))" ] }, { "cell_type": "code", "execution_count": 184, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " mean std\n", "D_AUC_train 0.866374 0.825970\n", "D_AUC_test 0.238595 0.783248\n", "D_AUC_train_Onto 7.646677 3.262931\n", "D_AUC_test_Onto 6.426810 2.847169\n", "D_AUC_train_Not_Onto -1.675274 0.740432\n", "D_AUC_test_Not_Onto -2.486323 1.080627\n", "D_AUC_train_Onto_LEN 243250.000000 61010.861011\n", "D_AUC_train_OntoN_LEN 772950.000000 61010.861011\n", "D_AUC_test_Onto_LEN 44450.000000 10617.210617\n", "D_AUC_test_OntoN_LEN 130850.000000 10617.210617\n", "D_CycleTime 26.308768 111.145037\n", "D_RAND 0.000000 0.000000\n", "D_Custom Vocab Len 0.000000 0.000000\n", "D_Full Vocab Len 0.000000 0.000000\n", "D_Testing Split 0.000000 0.000000\n", "D_Training Data Full 0.000000 0.000000\n", "D_Testing Data Full 0.000000 0.000000\n", "D_Training Data Final 0.000000 0.000000\n", "D_Testing Data Final 0.000000 0.000000\n", "M1 4750.000000 2736.138084\n", "M2 5550.000000 2736.138084\n", "FactorLength 100.000000 0.000000\n", "Neighbor 100.000000 0.000000" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.loc[filteredPairWise['Factor']=='O-Core',].describe()[1:3].transpose()*100" ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\jw\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1472: FutureWarning: \n", "Passing list-likes to .loc or [] with any missing label will raise\n", "KeyError in the future, you can use .reindex() as an alternative.\n", "\n", "See the documentation here:\n", "https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\n", " return self._getitem_tuple(key)\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_test \\\n", " count mean std min 25% 50% \n", "Factor \n", "DICT 48.0 -0.003197 0.010142 -0.034942 -0.009256 -0.003648 \n", "O-Core 32.0 0.002386 0.007832 -0.010105 -0.003225 0.001590 \n", "O-Core+Super 32.0 0.001154 0.005003 -0.010379 -0.002449 0.001353 \n", "RAND 48.0 0.004587 0.010032 -0.018929 -0.000477 0.004920 \n", "READ 48.0 -0.023203 0.009845 -0.038659 -0.031895 -0.023738 \n", "REPRESENT 48.0 0.012002 0.010824 -0.015564 0.006547 0.012238 \n", "TRUNC 48.0 0.035339 0.012115 0.008978 0.026802 0.037289 \n", "\n", " D_AUC_test__Onto ... D_AUC_train \\\n", " 75% max count mean ... 75% \n", "Factor ... \n", "DICT 0.004984 0.014587 0.0 NaN ... 0.001000 \n", "O-Core 0.005636 0.021869 0.0 NaN ... 0.015727 \n", "O-Core+Super 0.004869 0.012165 0.0 NaN ... 0.001584 \n", "RAND 0.010752 0.027761 0.0 NaN ... 0.002707 \n", "READ -0.014459 0.001918 0.0 NaN ... 0.013142 \n", "REPRESENT 0.019525 0.031153 0.0 NaN ... 0.037894 \n", "TRUNC 0.043767 0.060891 0.0 NaN ... 0.048820 \n", "\n", " D_AUC_train__Onto \n", " max count mean std min 25% 50% 75% max \n", "Factor \n", "DICT 0.012956 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "O-Core 0.029259 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "O-Core+Super 0.009733 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "RAND 0.027287 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "READ 0.023327 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "REPRESENT 0.066563 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "TRUNC 0.061448 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "\n", "[7 rows x 32 columns]" ] }, "execution_count": 185, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.loc[:,['Factor','D_AUC_train','D_AUC_test','D_AUC_train__Onto','D_AUC_test__Onto',]].sort_values(by='D_AUC_test',ascending=False).groupby('Factor').describe()" ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(32, 24)" ] }, "execution_count": 186, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.loc[filteredPairWise['Factor']=='O-Core',].shape" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [], "source": [ "filteredPairWise.sort_values(by='D_AUC_test',ascending=False).groupby('Factor').describe().to_csv(\"{}_consolidated.csv\".format(today))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ONLY RUN THIS CODE WHEN PRODUCING A SINGLE MODEL-RUN" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### This section is useful for analyzing appropriate LDA topic model sizes and Doc2Vec vector performance" ] }, { "cell_type": "code", "execution_count": 188, "metadata": {}, "outputs": [], "source": [ "# for i in trainDataFinal['filteredTexts'][:20]:\n", "# print(i)\n", "# print()\n", " \n", "# print(trainDataFinal.loc[11639,\"tokenizedTexts\"])" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [], "source": [ "# tokenizedTexts = trainDataFinal.loc[:,\"tokenizedTexts\"].copy()\n", "\n", "# cdict = Dictionary(tokenizedTexts)\n", "\n", "# tokenizedList_bow = [cdict.doc2bow(x) for x in tokenizedTexts]\n", "\n", "# x = list(np.arange(2,200,10))\n", "# perp = []\n", "\n", "# for i in tqdm.tqdm(x):\n", "# print(i)\n", "# ldaModel = LdaModel(\n", "# 
corpus=tokenizedList_bow,\n", "# id2word=cdict, \n", "# num_topics=i,\n", "# random_state=42,\n", "# passes=5,\n", "# alpha=1.0/i,\n", "# eta=1.0/i, \n", "# eval_every=1000, \n", "# iterations=5, \n", "# )\n", "# p = ldaModel.log_perplexity(tokenizedList_bow)\n", "# print(p)\n", "# perp.append(p)\n", " \n", "# sns.set(font_scale=1.25)\n", "# plt.figure(figsize=(12,8))\n", "# ax = sns.lineplot(x=x,y=perp,palette='colorblind')\n", "# ax.set_title(\"LDA Perplexity\")\n", "# ax.set(xlabel='Number of Topics in LDA Model', ylabel='Perplexity')\n", "# ax.spines['bottom'].set_color('0.25')\n", "# ax.spines['top'].set_color('0.25')\n", "# ax.spines['right'].set_color('0.25')\n", "# ax.spines['left'].set_color('0.25')\n", "# plt.savefig(\"plots/LDAperplexity.png\")\n", "# plt.show()" ] }, { "cell_type": "code", "execution_count": 190, "metadata": {}, "outputs": [], "source": [ "# df = pd.DataFrame(columns=[\n", "# 'Tag',\n", "# 'Train Tagged',\n", "# 'Train Not Tagged',\n", "# 'Train diff',\n", "# 'Train ttest',\n", "# 'Train pvalue',\n", "# 'Train Tagged Count',\n", "# 'Train Not Tagged Count',\n", "# 'Test Tagged',\n", "# 'Test Not Tagged',\n", "# 'Test diff',\n", "# 'Test ttest',\n", "# 'Test pvalue',\n", "# 'Test Tagged Count',\n", "# 'Test Not Tagged Count',\n", "# 'vs tagEASIER',\n", "# 'vs tagHARDER',\n", "# 'Intended Label',\n", "# 'Actual Label',\n", "# ])\n", "\n", "# cosinePerformance = []\n", "\n", "# for i in tagVectors:\n", "# tag = str(i[0])\n", "# testList = []\n", "# testNOTList = []\n", "# trainList = []\n", "# trainNOTList = []\n", "# for k in trainDataFinal.loc[trainFinalRep.index,:].index:\n", "# if tag in trainDataFinal.loc[k,'manualTags']:\n", "# trainList.append(k)\n", "# else:\n", "# trainNOTList.append(k)\n", " \n", "# for k in testDataFinal.loc[testFinalRep.index,:].index:\n", "# if tag in testDataFinal.loc[k,'manualTags']:\n", "# testList.append(k)\n", "# else:\n", "# testNOTList.append(k)\n", " \n", "# reps = pd.concat([trainFinalRep,testFinalRep])\n", " \n", " \n", "# testListCos = []\n", "# testNOTListCos = []\n", "# trainListCos = []\n", "# trainNOTListCos = [] \n", " \n", " \n", "# for k in testList:\n", "# testListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])\n", " \n", "# for k in testNOTList:\n", "# testNOTListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])\n", " \n", "# for k in trainList:\n", "# trainListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])\n", " \n", "# for k in trainNOTList:\n", "# trainNOTListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])\n", "\n", " \n", "# testTTEST = ttest_ind(a=testListCos,b=testNOTListCos,equal_var=False)\n", "# trainTTEST = ttest_ind(a=trainListCos,b=trainNOTListCos,equal_var=False)\n", " \n", "# vseasy = cosine_similarity([i[1],d2vModel['tagEASIER']])[0,1]\n", "# vshard = cosine_similarity([i[1],d2vModel['tagHARDER']])[0,1]\n", " \n", "# if i[0] in tags[:11]:\n", "# label = 'EASY'\n", "# else:\n", "# label = 'HARD'\n", " \n", "# if vseasy > vshard:\n", "# actual = 'EASY'\n", "# else:\n", "# actual = 'HARD'\n", " \n", " \n", "# sample = {\n", "# 'Tag':i[0],\n", " \n", "# 'Train Tagged':np.mean(trainListCos),\n", "# 'Train Not Tagged':np.mean(trainNOTListCos),\n", "# 'Train diff':np.abs(np.mean(trainListCos)-np.mean(trainNOTListCos)),\n", " \n", "# 'Train ttest':trainTTEST[0],\n", "# 'Train pvalue':trainTTEST[1],\n", "# 'Train Tagged Count':len(trainList),\n", "# 'Train Not Tagged Count':len(trainNOTList),\n", " \n", "# 'Test Tagged':np.mean(testListCos),\n", "# 'Test Not 
Tagged':np.mean(testNOTListCos),\n", "# 'Test diff':np.abs(np.mean(testListCos)-np.mean(testNOTListCos)),\n", " \n", "# 'Test ttest':testTTEST[0],\n", "# 'Test pvalue':testTTEST[1],\n", "# 'Test Tagged Count':len(testList),\n", "# 'Test Not Tagged Count':len(testNOTList),\n", " \n", "# 'vs tagEASIER':vseasy,\n", "# 'vs tagHARDER':vshard,\n", "# 'Intended Label':label,\n", "# 'Actual Label':actual,\n", " \n", "# }\n", "# # print(sample)\n", "# cosinePerformance.append(sample)\n", "# pd.DataFrame(cosinePerformance,columns=df.columns).to_csv('ttestOntology.csv')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }