{ "cells": [ { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: PYTHONHASHSEED=1\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "import time\n", "import datetime\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.metrics import roc_auc_score, accuracy_score\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer\n", "from sklearn.decomposition import LatentDirichletAllocation\n", "from sklearn.neural_network import MLPClassifier\n", "from scipy.stats import ttest_ind\n", "\n", "%set_env PYTHONHASHSEED=1\n", "# %matplotlib inline\n", "import itertools\n", "import tqdm as tqdm\n", "\n", "from gensim.test.utils import common_texts, get_tmpfile\n", "from gensim.models import Word2Vec, Doc2Vec, LdaModel, TfidfModel\n", "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n", "from gensim.test.utils import common_corpus, common_dictionary\n", "from gensim.parsing.preprocessing import strip_punctuation, strip_short, strip_non_alphanum, strip_tags, strip_multiple_whitespaces, remove_stopwords\n", "from gensim.parsing.preprocessing import preprocess_string\n", "from gensim.corpora import Dictionary\n", "from gensim.parsing.preprocessing import STOPWORDS\n", "from gensim.matutils import corpus2dense" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "today = re.sub(string=str(datetime.datetime.today()), pattern=\"\\W\", repl=\"\")" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'20190528085750160508'" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "today" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(14897, 34)" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata = pd.read_json('full.json')\n", "rawDdata.shape" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "rawDdata['Start Date'] = pd.to_datetime(rawDdata['Start Date'],unit='ms')\n", "rawDdata['End Date'] = pd.to_datetime(rawDdata['End Date'],unit='ms')\n", "rawDdata.to_csv(\"fiddlingRaw.csv\")" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(14604, 34)" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata = rawDdata[rawDdata['segCode']==0]\n", "rawDdata.shape" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "rawDdata = rawDdata.sort_index()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Id 39359102\n", "Start Date 2018-02-28 00:00:00\n", "Start Time 16:59:00\n", "End Date 2018-03-06 00:00:00\n", "End Time NULL\n", "Duration NULL\n", "Entered by Jason Coleman\n", "Notes NULL\n", "Question Trouble locating QL49 .J4453\n", "Answer 4:59 25870693270869823292870615 transfer from ...\n", "Notes FULL NULL\n", "Library Dept./Branch/Service Hale Library Help\n", "Where were you? Service Desk\n", "Who answers jmc\n", "Who Asked? KSU undergrad\n", "How many in the group? NULL\n", "Question Format im\n", "Question Type Directional\n", "Referred to? 
NULL\n", "READ 2\n", "Time Spent 6-10 min\n", "Class/Discipline NULL\n", "tags NULL\n", "Room reservation NULL\n", "Reported to: NULL\n", "READ_1_vs_2 1\n", "READ_2_vs_3 0\n", "Transcript 4:59 25870693270869823292870615 transfer from ...\n", "TransLength 1178\n", "segTrans [[4:59, patron, Hi I can't find my book], [4:5...\n", "segCode 0\n", "PatronTextString Hi I can't find my book I can't find the QL I ...\n", "StaffTextString Ok. Which one are you looking for? Are you in ...\n", "AllTextString Hi I can't find my book Ok. Which one are you ...\n", "Name: 13751, dtype: object" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata.loc[13751,]" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "rawDdata = rawDdata.sort_values(by='Start Date')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "rawDdata.head(10)" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "32.71 : Splitting Patron Lines\n" ] } ], "source": [ "def getPatronSections(df,breaks):\n", " st = time.time()\n", " \n", " df = df.copy()\n", " corpus = df['PatronTextString'].str.split(pat=\"\\s{1,}\").copy()\n", " \n", " for i in corpus.index:\n", " \n", " df.loc[i,'First5'] = \" \".join(corpus.loc[i][:breaks[0]])\n", " df.loc[i,'First10'] = \" \".join(corpus.loc[i][:breaks[1]])\n", " df.loc[i,'First20'] = \" \".join(corpus.loc[i][:breaks[2]])\n", " \n", " et = time.time() - st\n", " print('{:.2f} : Splitting Patron Lines'.format(et)) \n", " return(df) \n", "\n", "patronSegmentsOptions = [5,10,20]\n", "rawDdata = getPatronSections(df=rawDdata,breaks=patronSegmentsOptions)" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Id 39359102\n", "Start Date 2018-02-28 00:00:00\n", "Start Time 16:59:00\n", "End Date 2018-03-06 00:00:00\n", "End Time NULL\n", "Duration NULL\n", "Entered by Jason Coleman\n", "Notes NULL\n", "Question Trouble locating QL49 .J4453\n", "Answer 4:59 25870693270869823292870615 transfer from ...\n", "Notes FULL NULL\n", "Library Dept./Branch/Service Hale Library Help\n", "Where were you? Service Desk\n", "Who answers jmc\n", "Who Asked? KSU undergrad\n", "How many in the group? NULL\n", "Question Format im\n", "Question Type Directional\n", "Referred to? NULL\n", "READ 2\n", "Time Spent 6-10 min\n", "Class/Discipline NULL\n", "tags NULL\n", "Room reservation NULL\n", "Reported to: NULL\n", "READ_1_vs_2 1\n", "READ_2_vs_3 0\n", "Transcript 4:59 25870693270869823292870615 transfer from ...\n", "TransLength 1178\n", "segTrans [[4:59, patron, Hi I can't find my book], [4:5...\n", "segCode 0\n", "PatronTextString Hi I can't find my book I can't find the QL I ...\n", "StaffTextString Ok. Which one are you looking for? Are you in ...\n", "AllTextString Hi I can't find my book Ok. 
Which one are you ...\n", "First5 Hi I can't find my\n", "First10 Hi I can't find my book I can't find the\n", "First20 Hi I can't find my book I can't find the QL I ...\n", "Name: 13751, dtype: object" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata.loc[13751,]" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Id int64\n", "Start Date datetime64[ns]\n", "Start Time object\n", "End Date datetime64[ns]\n", "End Time object\n", "Duration object\n", "Entered by object\n", "Notes object\n", "Question object\n", "Answer object\n", "Notes FULL object\n", "Library Dept./Branch/Service object\n", "Where were you? object\n", "Who answers object\n", "Who Asked? object\n", "How many in the group? object\n", "Question Format object\n", "Question Type object\n", "Referred to? object\n", "READ object\n", "Time Spent object\n", "Class/Discipline object\n", "tags object\n", "Room reservation object\n", "Reported to: object\n", "READ_1_vs_2 int64\n", "READ_2_vs_3 int64\n", "Transcript object\n", "TransLength int64\n", "segTrans object\n", "segCode int64\n", "PatronTextString object\n", "StaffTextString object\n", "AllTextString object\n", "First5 object\n", "First10 object\n", "First20 object\n", "dtype: object" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata.dtypes" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Id', 'Start Date', 'Start Time', 'End Date', 'End Time', 'Duration',\n", " 'Entered by', 'Notes', 'Question', 'Answer', 'Notes FULL',\n", " 'Library Dept./Branch/Service', 'Where were you?', 'Who answers',\n", " 'Who Asked?', 'How many in the group?', 'Question Format',\n", " 'Question Type', 'Referred to?', 'READ', 'Time Spent',\n", " 'Class/Discipline', 'tags', 'Room reservation', 'Reported to:',\n", " 'READ_1_vs_2', 'READ_2_vs_3', 'Transcript', 'TransLength', 'segTrans',\n", " 'segCode', 'PatronTextString', 'StaffTextString', 'AllTextString',\n", " 'First5', 'First10', 'First20'],\n", " dtype='object')" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rawDdata.columns" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "manualTags = [\n", " ['tagURL',[\n", " re.escape('amazon.com'),\n", " re.escape('newfirstsearch'),\n", " re.escape('galegroup'),\n", " re.escape('ingentaconnect.com'),\n", " re.escape('proquest.com'),\n", " re.escape('ncbi.nlm.nih.gov'),\n", " re.escape('sciencedirect.com'),\n", " re.escape('springer.com'),\n", " re.escape('tandfonline.com'),\n", " re.escape('webofknowledge'),\n", " re.escape('wiley.com'),\n", " re.escape('books.google'),\n", " re.escape('google.com'),\n", "\n", " re.escape('apps.lib.k-state.edu/databases'),\n", "\n", " re.escape('er.lib.ksu.edu'),\n", " re.escape('er.lib.k-state.edu'),\n", "\n", " re.escape('getit.lib.ksu.edu'),\n", " re.escape('getit.lib.k-state.edu'),\n", "\n", " re.escape('guides.lib.ksu.edu'),\n", " re.escape('guides.lib.k-state.edu'),\n", "\n", " re.escape('catalog.lib.ksu.edu'),\n", " re.escape('catalog2.lib.ksu.edu'),\n", " re.escape('catalog.lib.k-state.edu'),\n", " re.escape('catalog2.lib.k-state.edu'),\n", "\n", " re.escape('primo.hosted.exlibrisgroup.com'),\n", " re.escape('na02.alma.exlibrisgroup'),\n", "\n", " re.escape('searchit.lib.ksu.edu'),\n", " re.escape('searchit.lib.k-state.edu'),\n", "\n", " 
re.escape('lib.k-state.edu'),\n", " re.escape('lib.k-state.edu'),\n", "\n", " re.escape('doi.org'),\n", "\n", " re.escape('http'),\n", " re.escape('www.'),]\n", " ],\n", " \n", " ['tagPRINTING',[\n", " 'color print',\n", " 'colored print',\n", " 'print in color',\n", " 'print something in color',\n", " \"\\Win color\\W\",\n", " \"cat cash\",\n", " 'printer',\n", " '(? 0:\n", " listIndex.append(int(i))\n", " else:\n", " notIndex.append(int(i))\n", " \n", " Xlist = representation.loc[listIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist) \n", " \n", " RocAucScore_list = roc_auc_score(y_true=labels.loc[listIndex],y_score=y_prob_list)\n", " df.loc[listIndex,'PredictProbList'] = y_prob_list\n", " df.loc[listIndex,'PredictPredList'] = yprob_pred_list\n", "\n", " \n", " \n", " Xlist = representation.loc[notIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist)\n", " \n", " RocAucScore_NOTlist = roc_auc_score(y_true=labels.loc[notIndex],y_score=y_prob_list)\n", " df.loc[notIndex,'PredictProbNOTList'] = y_prob_list\n", " df.loc[notIndex,'PredictPredNOTList'] = yprob_pred_list\n", " \n", " \n", " \n", " # DEFAULT STATE\n", " else:\n", " RocAucScore_list = RocAucScore\n", " RocAucScore_NOTlist = RocAucScore\n", " df.loc[labels.index,'PredictProbList'] = yprob\n", " df.loc[labels.index,'PredictPredList'] = yprob_pred\n", " \n", " \n", " df.loc[labels.index,'PredictProb'] = yprob\n", " df.loc[labels.index,'PredictPred'] = yprob_pred\n", " \n", " et = time.time() - st\n", " print('{:.2f} : Running MLP Fit and Eval'.format(et))\n", " return(neural_model,RocAucScore,RocAucScore_list,RocAucScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X)" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [], "source": [ "def testModel(df,targetLabels,modelType,dictionary,neural_model,collectTags,ldajoincolumns,tagState):\n", " st = time.time()\n", " \n", " df = df.copy()\n", " \n", " labels = df.loc[df[targetLabels]!=999,targetLabels].astype(int).copy()\n", " df_test = df.loc[labels.index,].copy()\n", " \n", " if modelType == 'lda':\n", " tokenizedTexts = list(df_test['tokenizedTexts'])\n", " tokenizedList_bow = [dictionary.doc2bow(x) for x in tokenizedTexts]\n", " \n", " ldaDocMatrix = [i for i in ldaModel.get_document_topics(tokenizedList_bow)]\n", " representation = pd.DataFrame(corpus2dense(ldaDocMatrix, num_terms=ldaModel.num_topics).transpose(), index=df_test.index)\n", " \n", " \n", " if collectTags:\n", " counter = CountVectorizer(vocabulary=ldajoincolumns,binary=True)\n", " tagstrings = df_test['manualTags'].str.join(\" \")\n", " tagDF = pd.DataFrame(counter.fit_transform(tagstrings).todense(),\n", " columns=ldajoincolumns,\n", " index=df_test.index)\n", " representation = representation.merge(tagDF,left_index=True,right_index=True,suffixes=(False,False))\n", " \n", " \n", " if modelType == 'd2v':\n", " indic = []\n", " dat = []\n", " for i in df_test.index:\n", "# print(i)\n", " indic.append(i)\n", "# dat.append(d2vModel.infer_vector((df_test.loc[i,'tokenizedTexts']+df_test.loc[i,'manualTags']), steps=100))\n", " dat.append(d2vModel.infer_vector((df_test.loc[i,'tokenizedTexts']), steps=100))\n", "\n", " representation = pd.DataFrame(dat,index=indic)\n", " \n", " X = representation\n", " yTest_prob = neural_model.predict_proba(X)[:,1]\n", " yTest_pred = neural_model.predict(X)\n", " RocAucScore = 
roc_auc_score(y_true=labels,y_score=yTest_prob)\n", " \n", " listIndex = []\n", " notIndex = []\n", " if tagState:\n", " for i in df.loc[labels.index,].index:\n", " if len(df.loc[i,'manualTags']) > 0:\n", " listIndex.append(int(i))\n", " else:\n", " notIndex.append(int(i))\n", " \n", " Xlist = representation.loc[listIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist) \n", " \n", " RocAucScore_list = roc_auc_score(y_true=labels.loc[listIndex],y_score=y_prob_list)\n", " df.loc[listIndex,'PredictProbList'] = y_prob_list\n", " df.loc[listIndex,'PredictPredList'] = yprob_pred_list\n", "\n", " \n", " \n", " Xlist = representation.loc[notIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist)\n", " \n", " RocAucScore_NOTlist = roc_auc_score(y_true=labels.loc[notIndex],y_score=y_prob_list)\n", " df.loc[notIndex,'PredictProbNOTList'] = y_prob_list\n", " df.loc[notIndex,'PredictPredNOTList'] = yprob_pred_list\n", " \n", " \n", " \n", " # DEFAULT STATE\n", " else:\n", " RocAucScore_list = RocAucScore\n", " RocAucScore_NOTlist = RocAucScore\n", " df.loc[labels.index,'PredictProbList'] = yTest_prob\n", " df.loc[labels.index,'PredictPredList'] = yTest_pred\n", " \n", " \n", " df.loc[labels.index,'PredictProb'] = yTest_prob\n", " df.loc[labels.index,'PredictPred'] = yTest_pred\n", " \n", " et = time.time() - st\n", " print('{:.2f} : Testing Model with Holdout Data'.format(et))\n", " return(RocAucScore,RocAucScore_list,RocAucScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X)\n", " " ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [], "source": [ "def prepareTestingData(df,vocab,section):\n", " st = time.time()\n", " \n", " df = df.copy() \n", " df['DocTags'] = [[] for i in range(df.shape[0])] \n", " \n", " \n", " splitStrings = df.loc[:,section].str.lower()\n", " splitStrings = splitStrings.str.split(\"\\W\")\n", " \n", " testTokens = pd.Series()\n", " for i in splitStrings.index:\n", " j = []\n", " for m in splitStrings.loc[i]:\n", " if m in vocab:\n", " j.append(m)\n", "\n", " testTokens.loc[i] = j\n", " df.loc[testTokens.index,'tokenizedTexts'] = testTokens\n", " \n", " \n", " et = time.time() - st\n", " print('{:.2f} : Preparing Test Holdout Data'.format(et))\n", " return(df)" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [], "source": [ "def getFocusedROCAUC(df_test,df_train,qtype,labelledsection,tagState):\n", " st = time.time()\n", " df_test = df_test.copy()\n", " df_train = df_train.copy()\n", " \n", " df_train = df_train.loc[df_train['READ']!='Unknown',]\n", " ytrue = df_train.loc[df_train['Question Type']==qtype,labelledSection]\n", " ypred = df_train.loc[df_train['Question Type']==qtype,'PredictProb']\n", " trainRocAucScore = roc_auc_score(y_true=ytrue,y_score=ypred)\n", " \n", " df_test = df_test.loc[df_test['READ']!='Unknown',]\n", " ytrue = df_test.loc[df_test['Question Type']==qtype,labelledSection]\n", " ypred = df_test.loc[df_test['Question Type']==qtype,'PredictProb']\n", " testRocAucScore = roc_auc_score(y_true=ytrue,y_score=ypred)\n", " \n", " if tagState:\n", " trainList = []\n", " for i in df_train.index:\n", " if len(df_train.loc[i,'manualTags']) > 0:\n", " trainList.append(i)\n", " testList = []\n", " for i in df_test.index:\n", " if len(df_test.loc[i,'manualTags']) > 0:\n", " trainList.append(i)\n", " \n", "# trainList = 
list(set(consolidatedList).intersection(set(df_train.index)))\n", "# testList = list(set(consolidatedList).intersection(set(df_test.index)))\n", "\n", " df_train_list = df_train.loc[trainList,].copy()\n", " ytrue_list = df_train_list.loc[df_train_list['Question Type']==qtype,labelledSection]\n", " ypred_list = df_train_list.loc[df_train_list['Question Type']==qtype,'PredictProbList']\n", " try:\n", " trainRocAucScore_list = roc_auc_score(y_true=ytrue_list,y_score=ypred_list)\n", " except:\n", " ypred_list = df_train_list.loc[df_train_list['Question Type']==qtype,'PredictPred']\n", " trainRocAucScore_list = \"Acc_\" + str(accuracy_score(y_true=ytrue_list,y_pred=ypred_list))\n", "\n", "\n", " df_test_list = df_test.loc[testList,].copy()\n", " ytrue_list = df_test_list.loc[df_test_list['Question Type']==qtype,labelledSection]\n", " ypred_list = df_test_list.loc[df_test_list['Question Type']==qtype,'PredictProbList']\n", " try:\n", " testRocAucScore_list = roc_auc_score(y_true=ytrue_list,y_pred=ypred_list)\n", " except:\n", " ypred_list = df_test_list.loc[df_test_list['Question Type']==qtype,'PredictPredList']\n", " testRocAucScore_list = \"Acc_\" + str(accuracy_score(y_true=ytrue_list,y_pred=ypred_list))\n", " trainListShape = len(trainList)\n", " testListShape = len(testList)\n", "\n", " \n", " else:\n", " trainRocAucScore_list = None\n", " testRocAucScore_list = None\n", " trainListShape = 0\n", " testListShape = 0\n", " \n", " \n", " et = time.time() - st\n", " print('{:.2f} : Getting Focused ROC-AUC Scores ({})'.format(et,qtype))\n", " return(trainRocAucScore,testRocAucScore,trainRocAucScore_list,testRocAucScore_list,trainListShape,testListShape)" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [], "source": [ "# custom vocab builder\n", "customVocab = []\n", "\n", "# random states\n", "randomOptions = list(np.arange(0,20,1))\n", "# randomOptions = [0]\n", "\n", "#test Split\n", "\n", "testingSplits = [\n", " 2000,\n", "]\n", "\n", "\n", "manualTagOptions = [\n", " True,\n", " False,\n", "]\n", "rollupsOptions = [\n", " True,\n", " False,\n", "]\n", "\n", "\n", "#tfidf limits\n", "dictOptions = [ \n", " (False, 1, 300000),\n", " (True, 2, 3000),\n", "]\n", "\n", "#which patron section to look at\n", "patronSectionOptions = [\n", " 'First10',\n", " 'First20',\n", "]\n", "\n", "#labels\n", "labelledSectionOptions = [\n", " 'READ_1_vs_2',\n", " 'READ_2_vs_3',\n", "]\n", "\n", "#model\n", "modelOptions = [\n", " 'lda',\n", " 'd2v',\n", "]\n", "\n", "\n", "optionsList = [\n", " randomOptions,\n", " manualTagOptions,\n", " rollupsOptions,\n", " dictOptions,\n", " patronSectionOptions,\n", " labelledSectionOptions,\n", " modelOptions,\n", "]\n", "\n", "optionsLen = [len(x) for x in optionsList]\n", "\n", "testtotal = np.prod(optionsLen)\n", "iterationcounter = 0\n", "\n", "parameterList = []" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "for z in tqdm.tqdm(randomOptions):\n", " randomSeed=z\n", "\n", "\n", " for s in testingSplits:\n", " split = s\n", " testsplit = rawDdata.shape[0]-split\n", " trainData = rawDdata[:testsplit]\n", " testData = rawDdata[testsplit:]\n", " \n", " trainShape = trainData.shape[0]\n", " testShape = testData.shape[0]\n", "\n", " for e in patronSectionOptions:\n", " patronSection=e \n", "\n", " for f in manualTagOptions:\n", " getTags = f\n", "\n", " for r in rollupsOptions:\n", " if getTags == True:\n", " rollup = r\n", " elif getTags == False:\n", " if r == True:\n", " 
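# with manual tags disabled the rollup flag has no effect, so skip the duplicate combination\n", " 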
continue\n", " else:\n", " rollup = r\n", " trainDataTagged = getManualTags(df=trainData,\n", " collectTags=getTags,\n", " manualTagsList=manualTags,\n", " section=patronSection,\n", " rollups=rollup,\n", " )\n", " testDataTagged = getManualTags(df=testData,\n", " collectTags=getTags,\n", " manualTagsList=manualTags,\n", " section=patronSection,\n", " rollups=rollup,\n", " )\n", "\n", " for g in labelledSectionOptions:\n", " labelledSection=g\n", "\n", " for h in modelOptions:\n", " model=h \n", "\n", " for m in dictOptions:\n", " truncate=m \n", "\n", " start_time = time.time()\n", "\n", " trainDataTRUNC,countDF,countList,tfidfList,remove,vocab,gendict = getTFIDFlimited(\n", " df=trainDataTagged,\n", " truncate=truncate,\n", " section=patronSection,\n", " controlVocab=customVocab,\n", " model=model,\n", " )\n", " \n", "\n", " if model == 'lda':\n", " ldaModel, representation, ldacolumntags = getLDAmatrix(\n", " df=trainDataTRUNC,\n", " dictionary=gendict,\n", " collectTags=getTags,\n", " random=randomSeed)\n", "\n", " if model == 'd2v':\n", " d2vModel, representation = doc2vecModel(\n", " df=trainDataTRUNC,\n", " random=randomSeed) \n", " ldacolumntags=None\n", "\n", "\n", "\n", " MLPmodel,AUCscore,AUCscorelist,AUCscoreNOTlist,trainLlen,trainNLlen,trainDataFinal,trainFinalRep = trainModel(\n", " df=trainDataTRUNC,\n", " reps=representation,\n", " targetLabels=labelledSection,\n", " random=randomSeed,\n", " tagState=getTags,)\n", "\n", "\n", " testDataPrepped = prepareTestingData(\n", " df=testDataTagged,\n", " vocab=vocab,\n", " section=patronSection,)\n", "\n", "\n", " testAUC,testAUClist,testAUCNOTlist,testLlen,testNLlen,testDataFinal,testFinalRep = testModel(\n", " df=testDataPrepped,\n", " targetLabels=labelledSection,\n", " modelType=model,\n", " dictionary=gendict,\n", " neural_model=MLPmodel,\n", " ldajoincolumns=ldacolumntags,\n", " collectTags=getTags,\n", " tagState=getTags,)\n", " \n", "\n", "\n", " end_time = time.time()\n", " total_time = end_time-start_time\n", "\n", " modelParameters = {\n", " \n", " 'TRUNC':patronSection,\n", " 'O-Core':getTags,\n", " 'O-Core+Super':rollup,\n", " 'DICT':str(truncate),\n", " 'REPRESENT':model,\n", " 'READ':labelledSection,\n", " 'AUC_train':AUCscore,\n", " 'AUC_test':testAUC,\n", "\n", " 'AUC_train_Onto':AUCscorelist,\n", " 'AUC_test_Onto':testAUClist,\n", " 'AUC_train_Not_Onto':AUCscoreNOTlist,\n", " 'AUC_test_Not_Onto':testAUCNOTlist,\n", "\n", " 'AUC_train_Onto_LEN':trainLlen,\n", " 'AUC_train_OntoN_LEN':trainNLlen,\n", " 'AUC_test_Onto_LEN':testLlen,\n", " 'AUC_test_OntoN_LEN':testNLlen,\n", " \n", " \n", " 'CycleTime':total_time,\n", " 'RAND':randomSeed,\n", " 'Custom Vocab Len':len(customVocab),\n", " 'Full Vocab Len':len(vocab),\n", " 'Testing Split':testsplit,\n", " 'Training Data Full':trainShape,\n", " 'Testing Data Full':testShape,\n", " 'Training Data Final':trainDataFinal.shape[0],\n", " 'Testing Data Final':testDataFinal.shape[0],\n", " }\n", "\n", " parameterList.append(list(modelParameters.values()))\n", " print(modelParameters.values())\n", " iterationcounter += 1\n", " print(\"{} out of {} complete\".format(iterationcounter,testtotal))\n", "\n", "parameterDataFrame = pd.DataFrame(parameterList,columns=list(modelParameters.keys()))" ] }, { "cell_type": "code", "execution_count": 131, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TRUNCO-CoreO-Core+SuperDICTREPRESENTREADAUC_trainAUC_testAUC_train_OntoAUC_test_Onto...AUC_test_OntoN_LENCycleTimeRANDCustom Vocab LenFull Vocab LenTesting SplitTraining Data FullTesting Data FullTraining Data FinalTesting Data Final
0First10TrueTrue(False, 1, 300000)ldaREAD_1_vs_20.7126270.6587890.8169800.729856...141336.25290500742212604126042000101621753
1First10TrueTrue(True, 2, 3000)ldaREAD_1_vs_20.7062740.6627810.8138950.731202...141328.37601700296712604126042000101621753
2First10TrueTrue(False, 1, 300000)d2vREAD_1_vs_20.7260600.6811490.7986950.726875...141331.99005700742212604126042000101621753
3First10TrueTrue(True, 2, 3000)d2vREAD_1_vs_20.7099610.6705470.7917960.730337...141325.85572000296712604126042000101621753
4First10TrueTrue(False, 1, 300000)ldaREAD_2_vs_30.7032180.6442130.8242650.768131...141335.52799300742212604126042000101621753
\n", "

5 rows × 25 columns

\n", "
" ], "text/plain": [ " TRUNC O-Core O-Core+Super DICT REPRESENT READ \\\n", "0 First10 True True (False, 1, 300000) lda READ_1_vs_2 \n", "1 First10 True True (True, 2, 3000) lda READ_1_vs_2 \n", "2 First10 True True (False, 1, 300000) d2v READ_1_vs_2 \n", "3 First10 True True (True, 2, 3000) d2v READ_1_vs_2 \n", "4 First10 True True (False, 1, 300000) lda READ_2_vs_3 \n", "\n", " AUC_train AUC_test AUC_train_Onto AUC_test_Onto ... \\\n", "0 0.712627 0.658789 0.816980 0.729856 ... \n", "1 0.706274 0.662781 0.813895 0.731202 ... \n", "2 0.726060 0.681149 0.798695 0.726875 ... \n", "3 0.709961 0.670547 0.791796 0.730337 ... \n", "4 0.703218 0.644213 0.824265 0.768131 ... \n", "\n", " AUC_test_OntoN_LEN CycleTime RAND Custom Vocab Len Full Vocab Len \\\n", "0 1413 36.252905 0 0 7422 \n", "1 1413 28.376017 0 0 2967 \n", "2 1413 31.990057 0 0 7422 \n", "3 1413 25.855720 0 0 2967 \n", "4 1413 35.527993 0 0 7422 \n", "\n", " Testing Split Training Data Full Testing Data Full Training Data Final \\\n", "0 12604 12604 2000 10162 \n", "1 12604 12604 2000 10162 \n", "2 12604 12604 2000 10162 \n", "3 12604 12604 2000 10162 \n", "4 12604 12604 2000 10162 \n", "\n", " Testing Data Final \n", "0 1753 \n", "1 1753 \n", "2 1753 \n", "3 1753 \n", "4 1753 \n", "\n", "[5 rows x 25 columns]" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parameterDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parameterDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TRUNCO-CoreO-Core+SuperDICTREPRESENTREADAUC_trainAUC_testAUC_train_OntoAUC_test_Onto...AUC_test_OntoN_LENCycleTimeRANDCustom Vocab LenFull Vocab LenTesting SplitTraining Data FullTesting Data FullTraining Data FinalTesting Data Final
0First10TrueTrue(False, 1, 300000)ldaREAD_1_vs_20.7126270.6587890.8169800.729856...141336.25290500742212604126042000101621753
1First10TrueTrue(True, 2, 3000)ldaREAD_1_vs_20.7062740.6627810.8138950.731202...141328.37601700296712604126042000101621753
2First10TrueTrue(False, 1, 300000)d2vREAD_1_vs_20.7260600.6811490.7986950.726875...141331.99005700742212604126042000101621753
3First10TrueTrue(True, 2, 3000)d2vREAD_1_vs_20.7099610.6705470.7917960.730337...141325.85572000296712604126042000101621753
4First10TrueTrue(False, 1, 300000)ldaREAD_2_vs_30.7032180.6442130.8242650.768131...141335.52799300742212604126042000101621753
\n", "

5 rows × 25 columns

\n", "
" ], "text/plain": [ " TRUNC O-Core O-Core+Super DICT REPRESENT READ \\\n", "0 First10 True True (False, 1, 300000) lda READ_1_vs_2 \n", "1 First10 True True (True, 2, 3000) lda READ_1_vs_2 \n", "2 First10 True True (False, 1, 300000) d2v READ_1_vs_2 \n", "3 First10 True True (True, 2, 3000) d2v READ_1_vs_2 \n", "4 First10 True True (False, 1, 300000) lda READ_2_vs_3 \n", "\n", " AUC_train AUC_test AUC_train_Onto AUC_test_Onto ... \\\n", "0 0.712627 0.658789 0.816980 0.729856 ... \n", "1 0.706274 0.662781 0.813895 0.731202 ... \n", "2 0.726060 0.681149 0.798695 0.726875 ... \n", "3 0.709961 0.670547 0.791796 0.730337 ... \n", "4 0.703218 0.644213 0.824265 0.768131 ... \n", "\n", " AUC_test_OntoN_LEN CycleTime RAND Custom Vocab Len Full Vocab Len \\\n", "0 1413 36.252905 0 0 7422 \n", "1 1413 28.376017 0 0 2967 \n", "2 1413 31.990057 0 0 7422 \n", "3 1413 25.855720 0 0 2967 \n", "4 1413 35.527993 0 0 7422 \n", "\n", " Testing Split Training Data Full Testing Data Full Training Data Final \\\n", "0 12604 12604 2000 10162 \n", "1 12604 12604 2000 10162 \n", "2 12604 12604 2000 10162 \n", "3 12604 12604 2000 10162 \n", "4 12604 12604 2000 10162 \n", "\n", " Testing Data Final \n", "0 1753 \n", "1 1753 \n", "2 1753 \n", "3 1753 \n", "4 1753 \n", "\n", "[5 rows x 25 columns]" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parameterDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AUC_train_Not_OntoAUC_test_Not_OntoAUC_train_Onto_LENAUC_train_OntoN_LENAUC_test_Onto_LENAUC_test_OntoN_LENCycleTimeRANDCustom Vocab LenFull Vocab Len
00.6807200.63751018328330340141336.252905007422
10.6731450.64425318328330340141328.376017002967
20.7040600.66598118328330340141331.990057007422
30.6847040.65206918328330340141325.855720002967
40.6705930.60220618328330340141335.527993007422
\n", "
" ], "text/plain": [ " AUC_train_Not_Onto AUC_test_Not_Onto AUC_train_Onto_LEN \\\n", "0 0.680720 0.637510 1832 \n", "1 0.673145 0.644253 1832 \n", "2 0.704060 0.665981 1832 \n", "3 0.684704 0.652069 1832 \n", "4 0.670593 0.602206 1832 \n", "\n", " AUC_train_OntoN_LEN AUC_test_Onto_LEN AUC_test_OntoN_LEN CycleTime \\\n", "0 8330 340 1413 36.252905 \n", "1 8330 340 1413 28.376017 \n", "2 8330 340 1413 31.990057 \n", "3 8330 340 1413 25.855720 \n", "4 8330 340 1413 35.527993 \n", "\n", " RAND Custom Vocab Len Full Vocab Len \n", "0 0 0 7422 \n", "1 0 0 2967 \n", "2 0 0 7422 \n", "3 0 0 2967 \n", "4 0 0 7422 " ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parameterDataFrame.iloc[:,10:20].head()" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [], "source": [ "parameterDataFrame.to_csv('{}_Run.csv'.format(today).format(today))" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [], "source": [ "preservedDataFrame = pd.read_csv('{}_Run.csv'.format(today),index_col=0)\n", "preservedDataFrame.to_csv('{}_preserveRun.csv'.format(today))" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [], "source": [ "preservedDataFrame = pd.read_csv('{}_preserveRun.csv'.format(today),index_col=0)" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(96, 25)" ] }, "execution_count": 140, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ " " ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [], "source": [ "preservedDataFrame['O-Core'] = np.where(preservedDataFrame['O-Core'], 1, 0)\n", "preservedDataFrame['O-Core+Super'] = np.where(preservedDataFrame['O-Core+Super'], 1, 0)\n", "preservedDataFrame['TRUNC'] = np.where(preservedDataFrame['TRUNC'] == 'First20', 2,\n", " np.where(preservedDataFrame['TRUNC'] == 'First10', 1, 0))\n", "preservedDataFrame['DICT'] = np.where(preservedDataFrame['DICT'] == \"\"\"(True, 2, 3000)\"\"\", 1, 0)\n", "\n", "preservedDataFrame['READ'] = np.where(preservedDataFrame['READ'] == 'READ_1_vs_2', 0, 1)\n", "preservedDataFrame['REPRESENT'] = np.where(preservedDataFrame['REPRESENT'] == 'lda', 0, 1)" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 
'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 143, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", " 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,\n", " 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,\n", " 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,\n", " 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,\n", " 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95],\n", " dtype='int64')" ] }, "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.index" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TRUNCO-CoreO-Core+SuperDICTREPRESENTREADAUC_trainAUC_testAUC_train_OntoAUC_test_Onto...AUC_test_OntoN_LENCycleTimeRANDCustom Vocab LenFull Vocab LenTesting SplitTraining Data FullTesting Data FullTraining Data FinalTesting Data Final
01110000.7126270.6587890.8169800.729856...141336.25290500742212604126042000101621753
11111000.7062740.6627810.8138950.731202...141328.37601700296712604126042000101621753
21110100.7260600.6811490.7986950.726875...141331.99005700742212604126042000101621753
31111100.7099610.6705470.7917960.730337...141325.85572000296712604126042000101621753
41110010.7032180.6442130.8242650.768131...141335.52799300742212604126042000101621753
51111010.6971650.6326390.8226270.739773...141328.37800000296712604126042000101621753
61110110.7238240.6495190.7821960.745600...141330.56500200742212604126042000101621753
71111110.7040800.6478820.7705380.716672...141325.69000000296712604126042000101621753
81100000.7042540.6580590.8129780.716202...141336.21899600742212604126042000101621753
91101000.7010960.6506160.8138770.722212...141328.41700100296712604126042000101621753
\n", "

10 rows × 25 columns

\n", "
" ], "text/plain": [ " TRUNC O-Core O-Core+Super DICT REPRESENT READ AUC_train AUC_test \\\n", "0 1 1 1 0 0 0 0.712627 0.658789 \n", "1 1 1 1 1 0 0 0.706274 0.662781 \n", "2 1 1 1 0 1 0 0.726060 0.681149 \n", "3 1 1 1 1 1 0 0.709961 0.670547 \n", "4 1 1 1 0 0 1 0.703218 0.644213 \n", "5 1 1 1 1 0 1 0.697165 0.632639 \n", "6 1 1 1 0 1 1 0.723824 0.649519 \n", "7 1 1 1 1 1 1 0.704080 0.647882 \n", "8 1 1 0 0 0 0 0.704254 0.658059 \n", "9 1 1 0 1 0 0 0.701096 0.650616 \n", "\n", " AUC_train_Onto AUC_test_Onto ... AUC_test_OntoN_LEN \\\n", "0 0.816980 0.729856 ... 1413 \n", "1 0.813895 0.731202 ... 1413 \n", "2 0.798695 0.726875 ... 1413 \n", "3 0.791796 0.730337 ... 1413 \n", "4 0.824265 0.768131 ... 1413 \n", "5 0.822627 0.739773 ... 1413 \n", "6 0.782196 0.745600 ... 1413 \n", "7 0.770538 0.716672 ... 1413 \n", "8 0.812978 0.716202 ... 1413 \n", "9 0.813877 0.722212 ... 1413 \n", "\n", " CycleTime RAND Custom Vocab Len Full Vocab Len Testing Split \\\n", "0 36.252905 0 0 7422 12604 \n", "1 28.376017 0 0 2967 12604 \n", "2 31.990057 0 0 7422 12604 \n", "3 25.855720 0 0 2967 12604 \n", "4 35.527993 0 0 7422 12604 \n", "5 28.378000 0 0 2967 12604 \n", "6 30.565002 0 0 7422 12604 \n", "7 25.690000 0 0 2967 12604 \n", "8 36.218996 0 0 7422 12604 \n", "9 28.417001 0 0 2967 12604 \n", "\n", " Training Data Full Testing Data Full Training Data Final \\\n", "0 12604 2000 10162 \n", "1 12604 2000 10162 \n", "2 12604 2000 10162 \n", "3 12604 2000 10162 \n", "4 12604 2000 10162 \n", "5 12604 2000 10162 \n", "6 12604 2000 10162 \n", "7 12604 2000 10162 \n", "8 12604 2000 10162 \n", "9 12604 2000 10162 \n", "\n", " Testing Data Final \n", "0 1753 \n", "1 1753 \n", "2 1753 \n", "3 1753 \n", "4 1753 \n", "5 1753 \n", "6 1753 \n", "7 1753 \n", "8 1753 \n", "9 1753 \n", "\n", "[10 rows x 25 columns]" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.head(10)" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(96, 25)" ] }, "execution_count": 146, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [], "source": [ "deltaColumns = [(\"D_\"+x) for x in preservedDataFrame.iloc[:,6:].columns]" ] }, { "cell_type": "code", "execution_count": 149, "metadata": { "scrolled": true }, "outputs": [], "source": [ "combos = list(itertools.combinations(list(preservedDataFrame.index),2))\n", "headers = ['TRUNC','O-Core','O-Core+Super','READ', 'REPRESENT','DICT','RAND']\n", "\n", "combolist = [list(x) for x in combos]\n", "\n", "pairwiseDataFrame = pd.DataFrame(columns=deltaColumns, index=range(len(combolist)))\n", "\n", "m1 = []\n", "m2 = []\n", "\n", "for i in combolist:\n", " 
m1.append(i[0])\n", " m2.append(i[1])\n", " \n", "pairwiseDataFrame['M1'] = m1\n", "pairwiseDataFrame['M2'] = m2" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4560, 21)" ] }, "execution_count": 150, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRUNC\n", "O-Core\n", "O-Core+Super\n", "READ\n", "REPRESENT\n", "DICT\n", "RAND\n" ] } ], "source": [ "for i in headers:\n", " print(i)" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['D_AUC_train', 'D_AUC_test', 'D_AUC_train_Onto', 'D_AUC_test_Onto',\n", " 'D_AUC_train_Not_Onto', 'D_AUC_test_Not_Onto', 'D_AUC_train_Onto_LEN',\n", " 'D_AUC_train_OntoN_LEN', 'D_AUC_test_Onto_LEN', 'D_AUC_test_OntoN_LEN',\n", " 'D_CycleTime', 'D_RAND', 'D_Custom Vocab Len', 'D_Full Vocab Len',\n", " 'D_Testing Split', 'D_Training Data Full', 'D_Testing Data Full',\n", " 'D_Training Data Final', 'D_Testing Data Final', 'M1', 'M2'],\n", " dtype='object')" ] }, "execution_count": 152, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
D_AUC_trainD_AUC_testD_AUC_train_OntoD_AUC_test_OntoD_AUC_train_Not_OntoD_AUC_test_Not_OntoD_AUC_train_Onto_LEND_AUC_train_OntoN_LEND_AUC_test_Onto_LEND_AUC_test_OntoN_LEN...D_RANDD_Custom Vocab LenD_Full Vocab LenD_Testing SplitD_Training Data FullD_Testing Data FullD_Training Data FinalD_Testing Data FinalM1M2
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN01
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN02
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN03
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN04
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN05
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN ... D_RAND \\\n", "0 NaN NaN NaN ... NaN \n", "1 NaN NaN NaN ... NaN \n", "2 NaN NaN NaN ... NaN \n", "3 NaN NaN NaN ... NaN \n", "4 NaN NaN NaN ... NaN \n", "\n", " D_Custom Vocab Len D_Full Vocab Len D_Testing Split D_Training Data Full \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_Testing Data Full D_Training Data Final D_Testing Data Final M1 M2 \n", "0 NaN NaN NaN 0 1 \n", "1 NaN NaN NaN 0 2 \n", "2 NaN NaN NaN 0 3 \n", "3 NaN NaN NaN 0 4 \n", "4 NaN NaN NaN 0 5 \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4560, 21)" ] }, "execution_count": 154, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [], "source": [ "breakpoints = np.arange(0,1900000,1000)" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 1000, 2000, ..., 1897000, 1898000, 1899000])" ] }, "execution_count": 156, "metadata": {}, "output_type": "execute_result" } ], "source": [ "breakpoints" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "for k in range(len(breakpoints)-1):\n", " print(\"BREAKPOINT {}-{}\".format(breakpoints[k],breakpoints[k+1]))\n", " \n", " start = breakpoints[k]\n", " end = breakpoints[k+1] \n", " \n", " fseries = pd.Series()\n", "\n", " for i in tqdm.tqdm(pairwiseDataFrame.index[start:end]):\n", " m1 = pairwiseDataFrame.loc[i,'M1']\n", " m2 = pairwiseDataFrame.loc[i,'M2']\n", "\n", " factors = str()\n", " counter = 0\n", "\n", " if preservedDataFrame.loc[m1,headers[0]] != preservedDataFrame.loc[m2,headers[0]]:\n", " factors = factors + headers[0] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[1]] != preservedDataFrame.loc[m2,headers[1]]:\n", " factors = factors + headers[1] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[2]] != preservedDataFrame.loc[m2,headers[2]]:\n", " factors = factors + headers[2] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[3]] != preservedDataFrame.loc[m2,headers[3]]:\n", " factors = factors + headers[3] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[4]] != preservedDataFrame.loc[m2,headers[4]]:\n", " factors = factors + headers[4] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[5]] != preservedDataFrame.loc[m2,headers[5]]:\n", " factors = factors + headers[5] + \" \"\n", " counter += 1 \n", "\n", " if preservedDataFrame.loc[m1,headers[6]] != preservedDataFrame.loc[m2,headers[6]]:\n", " factors = factors + headers[6] + \" \"\n", " counter += 1 \n", " \n", " fseries.loc[i] = factors\n", "\n", "\n", " 
pairwiseDataFrame.loc[fseries.index,'Factor'] = fseries\n" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
D_AUC_trainD_AUC_testD_AUC_train_OntoD_AUC_test_OntoD_AUC_train_Not_OntoD_AUC_test_Not_OntoD_AUC_train_Onto_LEND_AUC_train_OntoN_LEND_AUC_test_Onto_LEND_AUC_test_OntoN_LEN...D_Custom Vocab LenD_Full Vocab LenD_Testing SplitD_Training Data FullD_Testing Data FullD_Training Data FinalD_Testing Data FinalM1M2Factor
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN01DICT
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN02REPRESENT
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN03REPRESENT DICT
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN04READ
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN05READ DICT
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " ... D_Custom Vocab Len D_Full Vocab Len D_Testing Split \\\n", "0 ... NaN NaN NaN \n", "1 ... NaN NaN NaN \n", "2 ... NaN NaN NaN \n", "3 ... NaN NaN NaN \n", "4 ... NaN NaN NaN \n", "\n", " D_Training Data Full D_Testing Data Full D_Training Data Final \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_Testing Data Final M1 M2 Factor \n", "0 NaN 0 1 DICT \n", "1 NaN 0 2 REPRESENT \n", "2 NaN 0 3 REPRESENT DICT \n", "3 NaN 0 4 READ \n", "4 NaN 0 5 READ DICT \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 160, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "0 DICT \n", "1 REPRESENT \n", "2 REPRESENT DICT \n", "3 READ \n", "4 READ DICT \n", "5 READ REPRESENT \n", "6 READ REPRESENT DICT \n", "7 O-Core+Super \n", "8 O-Core+Super DICT \n", "9 O-Core+Super REPRESENT \n", "10 O-Core+Super REPRESENT DICT \n", "11 O-Core+Super READ \n", "12 O-Core+Super READ DICT \n", "13 O-Core+Super READ REPRESENT \n", "14 O-Core+Super READ REPRESENT DICT \n", "15 O-Core O-Core+Super \n", "16 O-Core O-Core+Super DICT \n", "17 O-Core O-Core+Super REPRESENT \n", "18 O-Core O-Core+Super REPRESENT DICT \n", "19 O-Core O-Core+Super READ \n", "20 O-Core O-Core+Super READ DICT \n", "21 O-Core O-Core+Super READ REPRESENT \n", "22 O-Core O-Core+Super READ REPRESENT DICT \n", "23 TRUNC \n", "24 TRUNC DICT \n", "25 TRUNC REPRESENT \n", "26 TRUNC REPRESENT DICT \n", "27 TRUNC READ \n", "28 TRUNC READ DICT \n", "29 TRUNC READ REPRESENT \n", " ... \n", "4530 O-Core DICT \n", "4531 O-Core \n", "4532 DICT \n", "4533 REPRESENT \n", "4534 REPRESENT DICT \n", "4535 READ \n", "4536 READ DICT \n", "4537 READ REPRESENT \n", "4538 READ REPRESENT DICT \n", "4539 REPRESENT DICT \n", "4540 REPRESENT \n", "4541 READ DICT \n", "4542 READ \n", "4543 READ REPRESENT DICT \n", "4544 READ REPRESENT \n", "4545 DICT \n", "4546 READ REPRESENT \n", "4547 READ REPRESENT DICT \n", "4548 READ \n", "4549 READ DICT \n", "4550 READ REPRESENT DICT \n", "4551 READ REPRESENT \n", "4552 READ DICT \n", "4553 READ \n", "4554 DICT \n", "4555 REPRESENT \n", "4556 REPRESENT DICT \n", "4557 REPRESENT DICT \n", "4558 REPRESENT \n", "4559 DICT \n", "Name: Factor, Length: 4560, dtype: object" ] }, "execution_count": 160, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame['Factor']" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame.to_csv('{}_pairwise.csv'.format(today))" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame = pd.read_csv('{}_pairwise.csv'.format(today),index_col=0)" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
D_AUC_trainD_AUC_testD_AUC_train_OntoD_AUC_test_OntoD_AUC_train_Not_OntoD_AUC_test_Not_OntoD_AUC_train_Onto_LEND_AUC_train_OntoN_LEND_AUC_test_Onto_LEND_AUC_test_OntoN_LEN...D_Custom Vocab LenD_Full Vocab LenD_Testing SplitD_Training Data FullD_Testing Data FullD_Training Data FinalD_Testing Data FinalM1M2Factor
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN01DICT
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN02REPRESENT
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN03REPRESENT DICT
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN04READ
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN05READ DICT
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " ... D_Custom Vocab Len D_Full Vocab Len D_Testing Split \\\n", "0 ... NaN NaN NaN \n", "1 ... NaN NaN NaN \n", "2 ... NaN NaN NaN \n", "3 ... NaN NaN NaN \n", "4 ... NaN NaN NaN \n", "\n", " D_Training Data Full D_Testing Data Full D_Training Data Final \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_Testing Data Final M1 M2 Factor \n", "0 NaN 0 1 DICT \n", "1 NaN 0 2 REPRESENT \n", "2 NaN 0 3 REPRESENT DICT \n", "3 NaN 0 4 READ \n", "4 NaN 0 5 READ DICT \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 163, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "splits = pairwiseDataFrame['Factor'].str.split()" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame['FactorLength'] = [len(x) for x in splits]" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " ... D_Full Vocab Len D_Testing Split D_Training Data Full \\\n", "0 ... NaN NaN NaN \n", "1 ... NaN NaN NaN \n", "2 ... NaN NaN NaN \n", "3 ... NaN NaN NaN \n", "4 ... NaN NaN NaN \n", "\n", " D_Testing Data Full D_Training Data Final D_Testing Data Final M1 M2 \\\n", "0 NaN NaN NaN 0 1 \n", "1 NaN NaN NaN 0 2 \n", "2 NaN NaN NaN 0 3 \n", "3 NaN NaN NaN 0 4 \n", "4 NaN NaN NaN 0 5 \n", "\n", " Factor FactorLength \n", "0 DICT 1 \n", "1 REPRESENT 1 \n", "2 REPRESENT DICT 2 \n", "3 READ 1 \n", "4 READ DICT 2 \n", "\n", "[5 rows x 23 columns]" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4560, 23)" ] }, "execution_count": 167, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame['Neighbor'] = np.where(pairwiseDataFrame['FactorLength']==1,1,0)" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN ... \\\n", "0 NaN NaN NaN ... \n", "1 NaN NaN NaN ... \n", "2 NaN NaN NaN ... \n", "3 NaN NaN NaN ... \n", "4 NaN NaN NaN ... \n", "\n", " D_Testing Split D_Training Data Full D_Testing Data Full \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " D_Training Data Final D_Testing Data Final M1 M2 Factor \\\n", "0 NaN NaN 0 1 DICT \n", "1 NaN NaN 0 2 REPRESENT \n", "2 NaN NaN 0 3 REPRESENT DICT \n", "3 NaN NaN 0 4 READ \n", "4 NaN NaN 0 5 READ DICT \n", "\n", " FactorLength Neighbor \n", "0 1 1 \n", "1 1 1 \n", "2 2 0 \n", "3 1 1 \n", "4 2 0 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 169, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.head()" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4560, 24)" ] }, "execution_count": 170, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pairwiseDataFrame.shape" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame.to_csv('{}_pairwise.csv'.format(today))" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "pairwiseDataFrame = pd.read_csv('{}_pairwise.csv'.format(today),index_col=0)" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [], "source": [ "filteredPairWise = pairwiseDataFrame.loc[pairwiseDataFrame['Neighbor']==1,].copy()" ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(304, 24)" ] }, "execution_count": 174, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.shape" ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [], "source": [ "filteredPairWise['Factor'] = filteredPairWise['Factor'].str.strip()" ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "7 NaN NaN NaN NaN \n", "23 NaN NaN NaN NaN \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "3 NaN NaN NaN \n", "7 NaN NaN NaN \n", "23 NaN NaN NaN \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "3 NaN NaN NaN \n", "7 NaN NaN NaN \n", "23 NaN NaN NaN \n", "\n", " ... D_Testing Split D_Training Data Full D_Testing Data Full \\\n", "0 ... NaN NaN NaN \n", "1 ... NaN NaN NaN \n", "3 ... NaN NaN NaN \n", "7 ... NaN NaN NaN \n", "23 ... NaN NaN NaN \n", "\n", " D_Training Data Final D_Testing Data Final M1 M2 Factor \\\n", "0 NaN NaN 0 1 DICT \n", "1 NaN NaN 0 2 REPRESENT \n", "3 NaN NaN 0 4 READ \n", "7 NaN NaN 0 8 O-Core+Super \n", "23 NaN NaN 0 24 TRUNC \n", "\n", " FactorLength Neighbor \n", "0 1 1 \n", "1 1 1 \n", "3 1 1 \n", "7 1 1 \n", "23 1 1 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 176, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.head()" ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(304, 24)" ] }, "execution_count": 177, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.shape" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['D_AUC_train', 'D_AUC_test', 'D_AUC_train_Onto', 'D_AUC_test_Onto',\n", " 'D_AUC_train_Not_Onto', 'D_AUC_test_Not_Onto', 'D_AUC_train_Onto_LEN',\n", " 'D_AUC_train_OntoN_LEN', 'D_AUC_test_Onto_LEN', 'D_AUC_test_OntoN_LEN',\n", " 'D_CycleTime', 'D_RAND', 'D_Custom Vocab Len', 'D_Full Vocab Len',\n", " 'D_Testing Split', 'D_Training Data Full', 'D_Testing Data Full',\n", " 'D_Training Data Final', 'D_Testing Data Final', 'M1', 'M2', 'Factor',\n", " 'FactorLength', 'Neighbor'],\n", " dtype='object')" ] }, "execution_count": 178, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.columns" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',\n", " 'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',\n", " 'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',\n", " 'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',\n", " 'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',\n", " 'Testing Split', 'Training Data Full', 'Testing Data Full',\n", " 'Training Data Final', 'Testing Data Final'],\n", " dtype='object')" ] }, "execution_count": 179, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preservedDataFrame.columns" ] }, { "cell_type": "code", "execution_count": 180, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 304/304 [00:02<00:00, 123.83it/s]\n" ] } ], "source": [ "for i in tqdm.tqdm(filteredPairWise.index):\n", " for k in filteredPairWise.columns[:-5]:\n", " m1 = filteredPairWise.loc[i,'M1']\n", " m2 = filteredPairWise.loc[i,'M2']\n", " factor = filteredPairWise.loc[i,'Factor']\n", " try:\n", " if preservedDataFrame.loc[m1,factor] > preservedDataFrame.loc[m2,factor]:\n", " filteredPairWise.loc[i,k] = preservedDataFrame.loc[m1,k[2:]] - 
preservedDataFrame.loc[m2,k[2:]]\n", " else:\n", " filteredPairWise.loc[i,k] = preservedDataFrame.loc[m2,k[2:]] - preservedDataFrame.loc[m1,k[2:]]\n", " except:\n", " continue\n", "\n" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_train D_AUC_test D_AUC_train_Onto D_AUC_test_Onto \\\n", "0 -0.006353 0.003992 -0.003085 0.001346 \n", "1 0.013433 0.022359 -0.018285 -0.002981 \n", "3 -0.009409 -0.014576 0.007286 0.038275 \n", "7 0.008373 0.000731 0.004001 0.013654 \n", "23 0.011228 0.033588 -0.017959 0.003631 \n", "\n", " D_AUC_train_Not_Onto D_AUC_test_Not_Onto D_AUC_train_Onto_LEN \\\n", "0 -0.007575 0.006743 0.0 \n", "1 0.023340 0.028472 0.0 \n", "3 -0.010127 -0.035304 0.0 \n", "7 0.010689 -0.002168 0.0 \n", "23 -0.000128 0.029411 1201.0 \n", "\n", " D_AUC_train_OntoN_LEN D_AUC_test_Onto_LEN D_AUC_test_OntoN_LEN \\\n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "7 0.0 0.0 0.0 \n", "23 -1201.0 209.0 -209.0 \n", "\n", " ... D_Testing Split D_Training Data Full D_Testing Data Full \\\n", "0 ... 0.0 0.0 0.0 \n", "1 ... 0.0 0.0 0.0 \n", "3 ... 0.0 0.0 0.0 \n", "7 ... 0.0 0.0 0.0 \n", "23 ... 0.0 0.0 0.0 \n", "\n", " D_Training Data Final D_Testing Data Final M1 M2 Factor \\\n", "0 0.0 0.0 0 1 DICT \n", "1 0.0 0.0 0 2 REPRESENT \n", "3 0.0 0.0 0 4 READ \n", "7 0.0 0.0 0 8 O-Core+Super \n", "23 0.0 0.0 0 24 TRUNC \n", "\n", " FactorLength Neighbor \n", "0 1 1 \n", "1 1 1 \n", "3 1 1 \n", "7 1 1 \n", "23 1 1 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 181, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.head()" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(304, 24)" ] }, "execution_count": 182, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.shape" ] }, { "cell_type": "code", "execution_count": 183, "metadata": {}, "outputs": [], "source": [ "filteredPairWise.to_csv('{}_filteredPairWise.csv'.format(today))" ] }, { "cell_type": "code", "execution_count": 184, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " mean std\n", "D_AUC_train 0.866374 0.825970\n", "D_AUC_test 0.238595 0.783248\n", "D_AUC_train_Onto 7.646677 3.262931\n", "D_AUC_test_Onto 6.426810 2.847169\n", "D_AUC_train_Not_Onto -1.675274 0.740432\n", "D_AUC_test_Not_Onto -2.486323 1.080627\n", "D_AUC_train_Onto_LEN 243250.000000 61010.861011\n", "D_AUC_train_OntoN_LEN 772950.000000 61010.861011\n", "D_AUC_test_Onto_LEN 44450.000000 10617.210617\n", "D_AUC_test_OntoN_LEN 130850.000000 10617.210617\n", "D_CycleTime 26.308768 111.145037\n", "D_RAND 0.000000 0.000000\n", "D_Custom Vocab Len 0.000000 0.000000\n", "D_Full Vocab Len 0.000000 0.000000\n", "D_Testing Split 0.000000 0.000000\n", "D_Training Data Full 0.000000 0.000000\n", "D_Testing Data Full 0.000000 0.000000\n", "D_Training Data Final 0.000000 0.000000\n", "D_Testing Data Final 0.000000 0.000000\n", "M1 4750.000000 2736.138084\n", "M2 5550.000000 2736.138084\n", "FactorLength 100.000000 0.000000\n", "Neighbor 100.000000 0.000000" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.loc[filteredPairWise['Factor']=='O-Core',].describe()[1:3].transpose()*100" ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\jw\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1472: FutureWarning: \n", "Passing list-likes to .loc or [] with any missing label will raise\n", "KeyError in the future, you can use .reindex() as an alternative.\n", "\n", "See the documentation here:\n", "https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\n", " return self._getitem_tuple(key)\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ " D_AUC_test \\\n", " count mean std min 25% 50% \n", "Factor \n", "DICT 48.0 -0.003197 0.010142 -0.034942 -0.009256 -0.003648 \n", "O-Core 32.0 0.002386 0.007832 -0.010105 -0.003225 0.001590 \n", "O-Core+Super 32.0 0.001154 0.005003 -0.010379 -0.002449 0.001353 \n", "RAND 48.0 0.004587 0.010032 -0.018929 -0.000477 0.004920 \n", "READ 48.0 -0.023203 0.009845 -0.038659 -0.031895 -0.023738 \n", "REPRESENT 48.0 0.012002 0.010824 -0.015564 0.006547 0.012238 \n", "TRUNC 48.0 0.035339 0.012115 0.008978 0.026802 0.037289 \n", "\n", " D_AUC_test__Onto ... D_AUC_train \\\n", " 75% max count mean ... 75% \n", "Factor ... \n", "DICT 0.004984 0.014587 0.0 NaN ... 0.001000 \n", "O-Core 0.005636 0.021869 0.0 NaN ... 0.015727 \n", "O-Core+Super 0.004869 0.012165 0.0 NaN ... 0.001584 \n", "RAND 0.010752 0.027761 0.0 NaN ... 0.002707 \n", "READ -0.014459 0.001918 0.0 NaN ... 0.013142 \n", "REPRESENT 0.019525 0.031153 0.0 NaN ... 0.037894 \n", "TRUNC 0.043767 0.060891 0.0 NaN ... 0.048820 \n", "\n", " D_AUC_train__Onto \n", " max count mean std min 25% 50% 75% max \n", "Factor \n", "DICT 0.012956 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "O-Core 0.029259 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "O-Core+Super 0.009733 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "RAND 0.027287 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "READ 0.023327 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "REPRESENT 0.066563 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "TRUNC 0.061448 0.0 NaN NaN NaN NaN NaN NaN NaN \n", "\n", "[7 rows x 32 columns]" ] }, "execution_count": 185, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.loc[:,['Factor','D_AUC_train','D_AUC_test','D_AUC_train__Onto','D_AUC_test__Onto',]].sort_values(by='D_AUC_test',ascending=False).groupby('Factor').describe()" ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(32, 24)" ] }, "execution_count": 186, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filteredPairWise.loc[filteredPairWise['Factor']=='O-Core',].shape" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [], "source": [ "filteredPairWise.sort_values(by='D_AUC_test',ascending=False).groupby('Factor').describe().to_csv(\"{}_consolidated.csv\".format(today))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ONLY RUN THIS CODE WHEN PRODUCING A SINGLE MODEL-RUN" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### This section is useful for analyzing appropriate LDA topic model sizes and Doc2Vec vector performance" ] }, { "cell_type": "code", "execution_count": 188, "metadata": {}, "outputs": [], "source": [ "# for i in trainDataFinal['filteredTexts'][:20]:\n", "# print(i)\n", "# print()\n", " \n", "# print(trainDataFinal.loc[11639,\"tokenizedTexts\"])" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [], "source": [ "# tokenizedTexts = trainDataFinal.loc[:,\"tokenizedTexts\"].copy()\n", "\n", "# cdict = Dictionary(tokenizedTexts)\n", "\n", "# tokenizedList_bow = [cdict.doc2bow(x) for x in tokenizedTexts]\n", "\n", "# x = list(np.arange(2,200,10))\n", "# perp = []\n", "\n", "# for i in tqdm.tqdm(x):\n", "# print(i)\n", "# ldaModel = LdaModel(\n", "# 
corpus=tokenizedList_bow,\n", "# id2word=cdict, \n", "# num_topics=i,\n", "# random_state=42,\n", "# passes=5,\n", "# alpha=1.0/i,\n", "# eta=1.0/i, \n", "# eval_every=1000, \n", "# iterations=5, \n", "# )\n", "# p = ldaModel.log_perplexity(tokenizedList_bow)\n", "# print(p)\n", "# perp.append(p)\n", " \n", "# sns.set(font_scale=1.25)\n", "# plt.figure(figsize=(12,8))\n", "# ax = sns.lineplot(x=x,y=perp,palette='colorblind')\n", "# ax.set_title(\"LDA Perplexity\")\n", "# ax.set(xlabel='Number of Topics in LDA Model', ylabel='Perplexity')\n", "# ax.spines['bottom'].set_color('0.25')\n", "# ax.spines['top'].set_color('0.25')\n", "# ax.spines['right'].set_color('0.25')\n", "# ax.spines['left'].set_color('0.25')\n", "# plt.savefig(\"plots/LDAperplexity.png\")\n", "# plt.show()" ] }, { "cell_type": "code", "execution_count": 190, "metadata": {}, "outputs": [], "source": [ "# df = pd.DataFrame(columns=[\n", "# 'Tag',\n", "# 'Train Tagged',\n", "# 'Train Not Tagged',\n", "# 'Train diff',\n", "# 'Train ttest',\n", "# 'Train pvalue',\n", "# 'Train Tagged Count',\n", "# 'Train Not Tagged Count',\n", "# 'Test Tagged',\n", "# 'Test Not Tagged',\n", "# 'Test diff',\n", "# 'Test ttest',\n", "# 'Test pvalue',\n", "# 'Test Tagged Count',\n", "# 'Test Not Tagged Count',\n", "# 'vs tagEASIER',\n", "# 'vs tagHARDER',\n", "# 'Intended Label',\n", "# 'Actual Label',\n", "# ])\n", "\n", "# cosinePerformance = []\n", "\n", "# for i in tagVectors:\n", "# tag = str(i[0])\n", "# testList = []\n", "# testNOTList = []\n", "# trainList = []\n", "# trainNOTList = []\n", "# for k in trainDataFinal.loc[trainFinalRep.index,:].index:\n", "# if tag in trainDataFinal.loc[k,'manualTags']:\n", "# trainList.append(k)\n", "# else:\n", "# trainNOTList.append(k)\n", " \n", "# for k in testDataFinal.loc[testFinalRep.index,:].index:\n", "# if tag in testDataFinal.loc[k,'manualTags']:\n", "# testList.append(k)\n", "# else:\n", "# testNOTList.append(k)\n", " \n", "# reps = pd.concat([trainFinalRep,testFinalRep])\n", " \n", " \n", "# testListCos = []\n", "# testNOTListCos = []\n", "# trainListCos = []\n", "# trainNOTListCos = [] \n", " \n", " \n", "# for k in testList:\n", "# testListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])\n", " \n", "# for k in testNOTList:\n", "# testNOTListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])\n", " \n", "# for k in trainList:\n", "# trainListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])\n", " \n", "# for k in trainNOTList:\n", "# trainNOTListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])\n", "\n", " \n", "# testTTEST = ttest_ind(a=testListCos,b=testNOTListCos,equal_var=False)\n", "# trainTTEST = ttest_ind(a=trainListCos,b=trainNOTListCos,equal_var=False)\n", " \n", "# vseasy = cosine_similarity([i[1],d2vModel['tagEASIER']])[0,1]\n", "# vshard = cosine_similarity([i[1],d2vModel['tagHARDER']])[0,1]\n", " \n", "# if i[0] in tags[:11]:\n", "# label = 'EASY'\n", "# else:\n", "# label = 'HARD'\n", " \n", "# if vseasy > vshard:\n", "# actual = 'EASY'\n", "# else:\n", "# actual = 'HARD'\n", " \n", " \n", "# sample = {\n", "# 'Tag':i[0],\n", " \n", "# 'Train Tagged':np.mean(trainListCos),\n", "# 'Train Not Tagged':np.mean(trainNOTListCos),\n", "# 'Train diff':np.abs(np.mean(trainListCos)-np.mean(trainNOTListCos)),\n", " \n", "# 'Train ttest':trainTTEST[0],\n", "# 'Train pvalue':trainTTEST[1],\n", "# 'Train Tagged Count':len(trainList),\n", "# 'Train Not Tagged Count':len(trainNOTList),\n", " \n", "# 'Test Tagged':np.mean(testListCos),\n", "# 'Test Not 
Tagged':np.mean(testNOTListCos),\n", "# 'Test diff':np.abs(np.mean(testListCos)-np.mean(testNOTListCos)),\n", " \n", "# 'Test ttest':testTTEST[0],\n", "# 'Test pvalue':testTTEST[1],\n", "# 'Test Tagged Count':len(testList),\n", "# 'Test Not Tagged Count':len(testNOTList),\n", " \n", "# 'vs tagEASIER':vseasy,\n", "# 'vs tagHARDER':vshard,\n", "# 'Intended Label':label,\n", "# 'Actual Label':actual,\n", " \n", "# }\n", "# # print(sample)\n", "# cosinePerformance.append(sample)\n", "# pd.DataFrame(cosinePerformance,columns=df.columns).to_csv('ttestOntology.csv')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }