{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "import time\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = pd.read_csv('raw_report.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "raw.loc[15565,]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = raw.drop(index=15565)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dropList = [\n",
    "    2062896,    7473118,    1722390,        6794102,    2317395,    2149084,    29572788,    3965629,    34467296,\n",
    "    2922840,    2879700,    38680925,    18901396,    2782099,    5820834,    36565513,    7444317,    2851588,\n",
    "    37271324,    5328272,    1999912,    23391049,    33190257,    5908857,    22846563,    38030852,    31054720,\n",
    "    1945386,    36758173,    37045153,    18493342,    39249867,    29770945,    1833912,    33012109,    32108530,\n",
    "    23008973,    34047208,    17289337,    1892914,    36273475,    37312913,    36245592,    7507894,    32928435,\n",
    "    4503562,    20363070,    29344325,    5779893,    31985520,    32358617,    32034901,    2760006,    1639532,\n",
    "    24961539,    37977984,    5802971,    4634115,    39250157,    32928397,    33490536,    30045940,    4929972,\n",
    "    36094645,    6835327,    37586409,    39199121,    32256818,    1704242,    3798636,    17212789,    4410883,\n",
    "    6385249,    4433455,    2055864,    4949358,    5912724,    15837672,    36325835,    33445471,    36618229,\n",
    "    4362213,    33956835,    32499331,    36325509,    36868257,    32061452,    15285712,    36115975,    32059951,\n",
    "    36860036,    32481177,    32002920,    34431545,    36656636,    36325301,    34197726,    33129443,    5796048,\n",
    "    34303712,    32389983,    34248344,    37256129,    36173758,    36638493,    36744373,    32841709,    20307399,\n",
    "    23716990,    2017834,    39095394,    39199516,    2055869,    2055877,    39225250,    37758302,    17201891,\n",
    "    2327318,    37508718,    17201438,    7491251,    23561779,    39377545,    7491251,    23561779,    39377545,\n",
    "    39108711,    18228907,    20348425,    25002999,    38190739,    38043997,    36131692,    18448844,\n",
    "]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "e = []\n",
    "for i in raw.index:\n",
    "    if raw.loc[i,'Id'] in dropList:\n",
    "        e.append(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = raw.drop(index=e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = raw.rename(index=str, columns={'Answer (ignore the character limit)' : 'Answer',\n",
    "                                    'Notes: (e.g., full question, sources consulted / recommended, problems, etc.)' : 'Notes FULL',\n",
    "                                    'Who answered (see list of initials​)' : 'Who answers',\n",
    "                                    'READ Scale (visit READ Scale site)':'READ',\n",
    "                                   })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time_spent = ['0-1','2-5','5-Feb','10-Jun','20-Nov','21-60','61+']\n",
    "time_spent_replace = ['0-1 min','2-5 min','2-5 min','6-10 min','11-20 min','21-60 min','61+ min']\n",
    "raw['Time Spent'] = raw['Time Spent'].replace(to_replace=time_spent, value=time_spent_replace)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "faq = pd.read_csv('frequentQuestions.csv', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in raw.index:\n",
    "    if raw.loc[i,'Question'] in list(faq.loc[:,0]):\n",
    "        raw.loc[i,'Question'] = 'NULL'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = raw.fillna('NULL')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['READ']=='unknown','READ'] = \"Unknown\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[(raw['Question Type']=='NULL')|(raw['Question Type']=='unknown'),'Question Type'] = 'Unknown'\n",
    "raw.loc[raw['Question Type']=='Research Consultation','Question Type'] = 'ResearchConsultation'\n",
    "raw.loc[raw['Question Type']=='K-REx','Question Type'] = 'KREx'\n",
    "raw.loc[raw['Question Type']=='New Prairie Press','Question Type'] = 'NewPrairiePress'\n",
    "raw.loc[raw['READ']=='NULL','READ'] = 'Unknown'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['Question Type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['READ'] == '10-Jun','Id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['READ'] == '10-Jun','READ'] = 'NULL'\n",
    "raw.loc[raw['READ'] == '1 (minimal effort)','READ'] = '1'\n",
    "raw.loc[raw['READ'] == '6 (maximal effort)','READ'] = '6'\n",
    "raw['Id'] = raw['Id'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "raw['READ_1_vs_2'] = int(0)\n",
    "raw['READ_2_vs_3'] = int(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in raw.index:\n",
    "    if raw.loc[i,'READ'] in ['3','4','5','6']:\n",
    "        raw.loc[i,'READ_1_vs_2'] = int(1)\n",
    "        raw.loc[i,'READ_2_vs_3'] = int(1)\n",
    "        \n",
    "    if raw.loc[i,'READ'] in ['2']:\n",
    "        raw.loc[i,'READ_1_vs_2'] = int(1)\n",
    "        raw.loc[i,'READ_2_vs_3'] = int(0)\n",
    "        \n",
    "    if raw.loc[i,'READ'] in ['1']:\n",
    "        raw.loc[i,'READ_1_vs_2'] = int(0)\n",
    "        raw.loc[i,'READ_2_vs_3'] = int(0)\n",
    "        \n",
    "    if raw.loc[i,'READ'] in ['Unknown']:\n",
    "        raw.loc[i,'READ_1_vs_2'] = int(999)\n",
    "        raw.loc[i,'READ_2_vs_3'] = int(999)\n",
    "        \n",
    "    if raw.loc[i,'READ'] in ['unknown']:\n",
    "        raw.loc[i,'READ_1_vs_2'] = int(999)\n",
    "        raw.loc[i,'READ_2_vs_3'] = int(999)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# raw[['READ','READ_1_vs_2','READ_2_vs_3']].head(50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['READ'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Categorical(raw['READ']).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['Id'] = raw['Id'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['Question'] = raw['Question'].astype(str)\n",
    "raw['Answer'] = raw['Answer'].astype(str)\n",
    "raw['Notes FULL'] = raw['Notes FULL'].astype(str)\n",
    "raw['Transcript'] = np.NaN\n",
    "raw['TransLength'] = np.NaN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "raw['Start Date'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['Start Date'] = pd.to_datetime(raw['Start Date'])\n",
    "raw['End Date'] = pd.to_datetime(raw['End Date'])\n",
    "raw['Start Time'] = pd.to_datetime(raw['Start Time']).dt.time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['Start Date'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = raw.drop_duplicates(subset=['Question','Answer','Question Type','READ','tags','Time Spent'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Series(raw['Question'].value_counts())[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for i in raw.index:\n",
    "    a = raw.loc[i,'Question']\n",
    "    b = raw.loc[i,'Answer']\n",
    "    c = raw.loc[i,'Notes FULL']\n",
    "    \n",
    "    la = len(a)\n",
    "    lb = len(b)\n",
    "    lc = len(c)\n",
    "    \n",
    "    k = 'null'\n",
    "    if (la > lb) & (la > lc):\n",
    "        k = a\n",
    "    elif (lb > la) & (lb > lc):\n",
    "        k = b\n",
    "    else:\n",
    "        k = c\n",
    "    raw.loc[i,'Transcript'] = k\n",
    "    raw.loc[i,'TransLength'] = len(k)\n",
    "    print(i,la,lb,lc,len(k))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['TransLength'].quantile(.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lengthFilter = raw.loc[((raw['TransLength'] > raw['TransLength'].quantile(.1)) & \n",
    "                    (raw['TransLength'] < raw['TransLength'].quantile(.9))),].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lengthFilter.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lengthFilter['TransLength'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['Id']==24412048,'Transcript'] = \"\"\"     sample redacted     \"\"\"\n",
    "raw.loc[raw['Id']==2124740,'Transcript'] = \"\"\"     sample redacted     \"\"\"\n",
    "raw.loc[raw['Id']==24489117,'Transcript'] = \"\"\"     sample redacted     \"\"\"\n",
    "raw.loc[raw['Id']==1730574,'Transcript'] = \"\"\"     sample redacted     \"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filter_list = list(lengthFilter.loc[(lengthFilter['tags'].str.contains('patron - very happy') |\n",
    "                            lengthFilter['tags'].str.contains('patron - very unhappy')),:].index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Categorical(lengthFilter.loc[filter_list,'READ']).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Categorical(lengthFilter.loc[filter_list,'Question Type']).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def newSegmenter(df):\n",
    "    df = df.copy()\n",
    "    \n",
    "    seg = []\n",
    "    \n",
    "    for i in df.index:\n",
    "        t = df.loc[i,'Transcript']\n",
    "        \n",
    "        seglist = []\n",
    "            \n",
    "        if re.match(pattern=\"^\\d{1,2}:\\d{2}\\s\\S{6,}\\s|^\\d{1,2}:\\d{2}\\sme\\s\",string=t):\n",
    "            nt = re.split(pattern=\"\\s(?=\\d{1,2}:\\d{2}\\s\\S{6,}\\s)|\\s(?=\\d{1,2}:\\d{2}\\sme\\s)\", string=t)\n",
    "            for k in nt:\n",
    "                seglist.append(re.split(pattern=\"\\s\",string=k,maxsplit=2))\n",
    "            \n",
    "            seg.append(seglist)\n",
    "        else:\n",
    "            seg.append('')\n",
    "            \n",
    "    return(seg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['segTrans'] = newSegmenter(raw)\n",
    "raw['segCode'] = int()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newSegmenter(raw.loc[raw['Id']==18883437,])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(raw.loc[raw['Id']==18883437,'Transcript'][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw.index==39108711,]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in raw.index:\n",
    "    if raw.loc[i,'segTrans'] == '':\n",
    "        raw.loc[i,'segCode'] = 1\n",
    "    else:\n",
    "        raw.loc[i,'segCode'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['Id']==17171207,]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "raw['segCode'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def referralSegmenter(df):\n",
    "    df = df.copy()\n",
    "    \n",
    "    df = df.loc[df['Transcript'].str.startswith(\"Referring URL: \") & df['segCode'] == 1,].copy()\n",
    "    \n",
    "    seg = pd.Series()\n",
    "    \n",
    "    for i in df.index:\n",
    "        seglist = []\n",
    "       \n",
    "        try:\n",
    "            t = re.split(pattern='\\s', string=df.loc[i,'Transcript'], maxsplit=3)[3]\n",
    "        except:\n",
    "            continue\n",
    "            \n",
    "        if re.match(pattern=\"^\\d{1,2}:\\d{2}\\s\\S{6,}\\s\",string=t):\n",
    "            nt = re.split(pattern=\"\\s(?=\\d{1,2}:\\d{2}\\s\\d{5,}\\s)|\\s(?=\\d{1,2}:\\d{2}\\sme\\s)\", string=t)\n",
    "            for k in nt:\n",
    "                seglist.append(re.split(pattern=\"\\s\",string=k,maxsplit=2))\n",
    "        else:\n",
    "            continue\n",
    "            \n",
    "        seg.loc[i] = seglist\n",
    "            \n",
    "    return(seg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nr = referralSegmenter(raw)\n",
    "raw.loc[nr.index,'segTrans'] = nr\n",
    "\n",
    "for i in raw.index:\n",
    "    if raw.loc[i,'segTrans'] == '':\n",
    "        raw.loc[i,'segCode'] = 1\n",
    "    else:\n",
    "        raw.loc[i,'segCode'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['segCode']==1,].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['segCode'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def errorTimeSegmenter(df):\n",
    "    df = df.copy()\n",
    "    \n",
    "    df = df.loc[df['Transcript'].str.startswith(\":\") & df['segCode'] == 1,].copy()\n",
    "    \n",
    "    seg = pd.Series()\n",
    "    \n",
    "    for i in df.index:\n",
    "        seglist = []\n",
    "\n",
    "        t = df.loc[i,'Transcript']\n",
    "        if re.match(pattern=\"^:\\d{2}\\s\\S{6,}\\s\",string=t):\n",
    "            nt = re.split(pattern=\"\\s(?=\\d{1,2}:\\d{2}\\s\\d{5,}\\s)|\\s(?=\\d{1,2}:\\d{2}\\sme\\s)\", string=t)\n",
    "            for k in nt:\n",
    "                seglist.append(re.split(pattern=\"\\s\",string=k,maxsplit=2))\n",
    "        else:\n",
    "            continue\n",
    "\n",
    "\n",
    "        seg.loc[i] = seglist\n",
    "            \n",
    "    return(seg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "net = errorTimeSegmenter(raw)\n",
    "raw.loc[net.index,'segTrans'] = net\n",
    "\n",
    "for i in raw.index:\n",
    "    if raw.loc[i,'segTrans'] == '':\n",
    "        raw.loc[i,'segCode'] = 1\n",
    "    else:\n",
    "        raw.loc[i,'segCode'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = \"12:11 78578940005542826253835712 Hi I am wond\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if re.match(pattern=\"^\\d{0,2}:\\d{2}\\s\\S{6,}\\s\", string=t):\n",
    "    print(True)\n",
    "else:\n",
    "    print(False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "raw.loc[raw['segCode']==1,['Question','Answer','Transcript'],]  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def recombinerSegmenters(df):\n",
    "    df = df.copy()\n",
    "    df = df.loc[df['segCode']==1,]\n",
    "    seg = pd.Series()\n",
    "    for i in df.index:\n",
    "        seglist = []\n",
    "        if re.match(pattern=\"^\\d{0,2}:\\d{2}\\s\\S{6,}\\s\", string=df.loc[i,'Question']):\n",
    "            if re.match(pattern=\"^\\d{0,2}:\\d{2}\\sme\\s\", string=df.loc[i,'Transcript']):\n",
    "                t = df.loc[i,'Question'] + ' ' + df.loc[i,'Transcript']\n",
    "                \n",
    "                if re.match(pattern=\"^\\d{0,2}:\\d{2}\\s\\S{6,}\",string=t):\n",
    "                    nt = re.split(pattern=\"\\s(?=\\d{1,2}:\\d{2}\\s\\S{6,}\\s)|\\s(?=\\d{1,2}:\\d{2}\\sme\\s)\", string=t)\n",
    "                    for k in nt:\n",
    "                        seglist.append(re.split(pattern=\"\\s\",string=k,maxsplit=2))\n",
    "        \n",
    "        seg.loc[i] = seglist\n",
    "             \n",
    "                \n",
    "                \n",
    "    return(seg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nrs = recombinerSegmenters(raw)\n",
    "nrs = nrs[nrs.str.len() > 0].copy()\n",
    "\n",
    "raw.loc[nrs.index,'segTrans'] = nrs\n",
    "\n",
    "for i in raw.index:\n",
    "    if raw.loc[i,'segTrans'] == '':\n",
    "        raw.loc[i,'segCode'] = 1\n",
    "    else:\n",
    "        raw.loc[i,'segCode'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['segCode'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.Transcript.duplicated().value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(raw.loc['10158','Transcript'])\n",
    "print('\\n')\n",
    "print(raw.loc['10158','Question'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(raw.loc['10159','Transcript'])\n",
    "print(raw.loc['10159','Question'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "raw.loc[raw.duplicated(subset=['Transcript','Question','Answer'],keep=False),['Id','Time Spent','tags','READ','Question Type','Answer','Question','Transcript']].sort_values(by=['Transcript'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='3rd floor noise complaint'].index)\n",
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='Noise complaint'].index)\n",
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='Scanner help'].index)\n",
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='noise complaint'].index)\n",
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='room reservation'].index)\n",
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='HLH - FILLER'].index)\n",
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='Do you have this resource (article, book, textbook, etc.)?'].index)\n",
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='University Experience Scavenger Hunt'].index)\n",
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='HLH - Do you have this book (or textbook)?'].index)\n",
    "raw = raw.drop(index=raw.loc[raw['Transcript']=='NULL'].index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "raw.loc[raw.duplicated(subset=['Transcript'],keep=False),['Id','Time Spent','tags','READ','Question Type','Answer','Question','Transcript']].sort_values(by=['Transcript'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['segCode'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weirdTimeSegmenter(df):\n",
    "    df = df.copy()\n",
    "    \n",
    "    df = df.loc[df['segCode']==1,]\n",
    "    df = df.loc[df['Transcript'].str.contains(\"^\\d{1,2}:\\d{2}\\D{1,2}\\s\\S{6,}\\S{5,}\", regex=True),]\n",
    "    \n",
    "    seg = pd.Series()\n",
    "    \n",
    "    for i in df.index:\n",
    "        seglist = []\n",
    "        t = df.loc[i,'Transcript']\n",
    "            \n",
    "#         try:\n",
    "        if re.match(pattern=\"^\\d{1,2}:\\d{2}\\D{1,2}\\s\\S{6,}\\S{5,}\",string=t):\n",
    "            nt = re.split(pattern=\"\\s(?=\\d{1,2}:\\d{2}\\D{2}\\s\\S{5,})\", string=t)\n",
    "            for k in nt:\n",
    "                seglist.append(re.split(pattern=\"\\s\",string=k,maxsplit=2))\n",
    "                \n",
    "        else:\n",
    "            continue\n",
    "#         except:\n",
    "#             seglist.append('Error')\n",
    "\n",
    "        seg.loc[i] = seglist\n",
    "            \n",
    "    return(seg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc['6119','Transcript'] = \"\"\"02:41PM 47187626333404367048001996@web.libraryh3lp.com how annoying is it to listen to the beeping for chats all day long? We were just told that you hear a beep until you pick up the chat. 02:39PM hale-ask-a-librarian@chat.libraryh3lp.com Some folks work more hours a day on it than others. I am one of many who do it only a few hours a week. It doesn\\'t really bother me because it calls my attention away from what I am doing and alerts me that I need to pay attention to the question coming in. However, I can transfer this question to someone who works many more hours a week than I do if you would like to get a 2nd opinion. 02:39PM hale-ask-a-librarian@chat.libraryh3lp.com The chats are normally spread out enough that they don\\'t pile on top of each other. When they do, just having to juggle 2-4 at once is a challenge. 02:41PM 47187626333404367048001996@web.libraryh3lp.com Do you ever get confused on which chat to answer first? 02:42PM hale-ask-a-librarian@chat.libraryh3lp.com A little. The new chat boxes can literally hide behind one you have open, so you have to click and drag to expose the new question. 02:44PM hale-ask-a-librarian@chat.libraryh3lp.com However, as you should see, there are times embedded in the chat so we can see who is first. We try to quickly answer Hello to let somone know that we are here. I found when I try to say much more besides Hello to begin with, another person monitoring may think the chat isn\\'t being picked up and will take it to make sure the question is being answered. 02:44PM hale-ask-a-librarian@chat.libraryh3lp.com There are times you are immersed in truly difficult questions, so that is a challenge when multiple chats hit at once. 02:45PM 47187626333404367048001996@web.libraryh3lp.com What is the weirdest thing anyone has asked you on the chats? 02:45PM hale-ask-a-librarian@chat.libraryh3lp.com Oh my. Let me think; there have been some odd things. 
02:47PM hale-ask-a-librarian@chat.libraryh3lp.com At the moment there isn\\'t anything that pops up first, but we do occasionally get what I assume are guys who get fresh and want a date or something. I tell them that they probably aren\\'t interested in anyone with 6 grandchildren. 02:50PM 47187626333404367048001996@web.libraryh3lp.com My mom uses the phrase \"get fresh\"! That must be crazy. They have no idea who they are even talking to! 02:51PM hale-ask-a-librarian@chat.libraryh3lp.com Very true! 02:51PM hale-ask-a-librarian@chat.libraryh3lp.com There are men who answer chat also, and that might surprise some folks. 02:54PM hale-ask-a-librarian@chat.libraryh3lp.com I try to make an effort to be friendly and show genuine empathy. As we were chatting another question came in about a book that is on Reserves. I told the person that it isn\\'t checked out at the moment and wished them good luck on the assignment. If it is a cold day, I\\'ll sometimes say something about staying warm, or if it is horrifically hot, maybe say something about staying cool. I think it humanizes the chat service, and we need to make those \"I care\" connections with the patrons. 02:55PM hale-ask-a-librarian@chat.libraryh3lp.com I wish I could remember a crazy chat at the moment, because I know I\\'ve had a few strange one. 02:55PM hale-ask-a-librarian@chat.libraryh3lp.com ones. 02:56PM 47187626333404367048001996@web.libraryh3lp.com That is an amazing idea! I always feel like I am chatting with a computer; and I think making that connection is the real difference in making someone feel comfortable with the library and the \"ask a librarian\" feature. 02:56PM hale-ask-a-librarian@chat.libraryh3lp.com We have a phone line, too, and for a while there was a guy from Colorado who would call and ask us to look up certain websites. I realized he was listening to late night radio--Coast to Coast a.m.--and he wanted to know things ranging from politcal conspiracy theories to alien landings. 
02:57PM hale-ask-a-librarian@chat.libraryh3lp.com I\\'m glad we can help you feel connected. There really are human beings answering our chat. 02:57PM 47187626333404367048001996@web.libraryh3lp.com Thank you so much! I hope you have a beautiful day. Even though the sky looks gloomy; it is beautiful out there! I enjoyed this conversation. 02:57PM hale-ask-a-librarian@chat.libraryh3lp.com It wasn\\'t an IM. He didn\\'t have a computer so he would call us. We all got to know his voice really we.. 02:57PM hale-ask-a-librarian@chat.libraryh3lp.com well. 02:58PM hale-ask-a-librarian@chat.libraryh3lp.com Thank you! I also you have a terrific day! I also enjoyed the conversation. People seldom ask these kinds of questions. 02:58PM hale-ask-a-librarian@chat.libraryh3lp.com Take care! 03:01PM 47187626333404367048001996@web.libraryh3lp.com Hey Hale! 03:02PM hale-ask-a-librarian@chat.libraryh3lp.com I haven\\'t closed my window--want to talk to someone else? 03:02PM hale-ask-a-librarian@chat.libraryh3lp.com I assume you are the one I\\'ve talked to a few minutes. 03:02PM 47187626333404367048001996@web.libraryh3lp.com Oh! Yes! I am totally good talking with you again! I was just wondering what job opportunities are offered to students through the library? 03:03PM hale-ask-a-librarian@chat.libraryh3lp.com we have a variety of things. We have several who work in user services and are at the help desk. We have students who work in Archives and Special Collections, and we have students who work for Building Services. 03:03PM hale-ask-a-librarian@chat.libraryh3lp.com Let me try to find the application instructions. 03:04PM hale-ask-a-librarian@chat.libraryh3lp.com http://www.lib.k-state.edu/jobs 03:04PM hale-ask-a-librarian@chat.libraryh3lp.com Scroll down to Student positions. 03:04PM hale-ask-a-librarian@chat.libraryh3lp.com Oh--we also have students in Interlibrary Services. 
03:05PM hale-ask-a-librarian@chat.libraryh3lp.com I\\'m sure there are more places, but those are what I\\'m thinking of off the top of my head. 03:06PM 47187626333404367048001996@web.libraryh3lp.com Thank you so much, again! Have a wonderful day! 03:06PM hale-ask-a-librarian@chat.libraryh3lp.com You too!\"\"\"\n",
    "raw.loc['10730','Transcript'] = \"\"\"01:25PM 76952912671820322253094861@web.libraryh3lp.com is the KSU database the same as the following link? It is for a new class but if its the same I will disregard and use the database resource? thanks 06:25PM 76952912671820322253094861@web.libraryh3lp.com https://k-state.instructure.com/courses/38718/external_tools/3399 01:26PM hale-ask-a-librarian@chat.libraryh3lp.com hi 01:26PM hale-ask-a-librarian@chat.libraryh3lp.com I'm checking ... be right there 01:27PM hale-ask-a-librarian@chat.libraryh3lp.com ok, so that is a research guide that is embedded into KState online/canvas for you to use for class 01:28PM hale-ask-a-librarian@chat.libraryh3lp.com I can't see it because I'm not a member of the class, but yes, it should link you to everything that you would get to from the library's website 01:28PM 76952912671820322253094861@web.libraryh3lp.com what makes it different from using ksu database for peer review and articles? 01:28PM 76952912671820322253094861@web.libraryh3lp.com ok thanks. I am just trying to unclog my mind with unnecessary resources. thanks 01:28PM hale-ask-a-librarian@chat.libraryh3lp.com I understand .... this semester was the first semester that we put these in KState Online 01:29PM hale-ask-a-librarian@chat.libraryh3lp.com we thought it might be easier if there was just one place you had to go 01:29PM 76952912671820322253094861@web.libraryh3lp.com ok thanks a bunch, I am trying not to learn \"new\" things if I don't have to since I am at the end of my journey. 01:29PM hale-ask-a-librarian@chat.libraryh3lp.com So you get the class info and then the research info without going out to a different website 01:29PM hale-ask-a-librarian@chat.libraryh3lp.com Understandable ... it's all the same so use whichever you are comfortable/familiar with 01:30PM hale-ask-a-librarian@chat.libraryh3lp.com ok, have a good day 01:30PM 76952912671820322253094861@web.libraryh3lp.com thanks! :) you too\"\"\"\n",
    "raw.loc['362','Transcript'] = \"\"\"2:59 980725465186257027863508592 Hi there, I was wondering if the library computers had Windows Movie Maker or some other sort of video editing program on them? 2:59 me Hi 2:59 me The computers in the Media Development Center have some editing software 3:00 me Actually there's several programs - I'm not certain if Movie Maker is among them, but they have several options 3:00 me That's on the 2nd floor with iTAC 3:01 98072546518625702786350859 Okay, thank you so much for your help 3:01 me You're very welcome! 2:59 me Hi 2:59 me The computers in the Media Development Center have some editing software 3:00 me Actually there's several programs - I'm not certain if Movie Maker is among them, but they have several options 3:00 me That's on the 2nd floor with iTAC 3:01 98072546518625702786350859 Okay, thank you so much for your help 3:01 me You're very welcome!\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "weirdTimeSegmenter(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nws = weirdTimeSegmenter(raw)\n",
    "\n",
    "raw.loc[nws.index,'segTrans'] = nws\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['segCode'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = raw.drop(index=raw.loc[raw['TransLength'] <100,].index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[(raw['TransLength'] <100) &\n",
    "        (raw['segCode']==1),].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['segCode']==1,].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unsegmented rows under 5000 characters with no run of five or more digits in\n",
    "# either text column. na=False keeps missing text from breaking the mask.\n",
    "raw.loc[(raw['segCode']==1) & \n",
    "                 (raw['TransLength'] < 5000) & \n",
    "                 ~(raw['Transcript'].str.contains(r\"\\d{5,}\",regex=True,na=False)) &\n",
    "                 ~(raw['Question'].str.contains(r\"\\d{5,}\",regex=True,na=False)),].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['segCode']==1,'TransLength'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def guestIPsegmenter(df):\n",
    "    \"\"\"Segment transcripts that contain a `The guest's IP is ...` preamble.\n",
    "\n",
    "    Only rows whose ``Transcript`` contains `. The guest's IP is ` are looked at.\n",
    "    The text after the preamble is normalised so every turn reads\n",
    "    `<time><AM|PM> <speaker> <text>`: `Guest 123` becomes `Guest123`,\n",
    "    `10:15 AM` becomes `10:15AM`, and the ` from ` between a timestamp and\n",
    "    `Guest`/`me` is dropped. Transcripts that do not then start with a\n",
    "    timestamp followed by `Guest` are skipped.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Needs a ``Transcript`` column of strings; missing values are ignored.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Indexed by the labels of the rows that were segmented; each value is a\n",
    "        list of ``[time, speaker, text]`` lists.\n",
    "    \"\"\"\n",
    "    # Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "    preamble = re.compile(r\"The guest's IP is\\s\\S{6,}\\s\")\n",
    "    glue = re.compile(r\"(?<=Guest)\\s(?=\\d{2,})|(?<=\\d{2})\\s(?=PM|AM)\")\n",
    "    from_word = re.compile(r\"(?<=\\d{2}(AM|PM))\\sfrom\\s(?=Guest)|(?<=\\d{2}(AM|PM))\\sfrom\\s(?=me\\s)\")\n",
    "    first_turn = re.compile(r\"^\\d{1,2}:\\d{2}\\D{2}\\sGuest\")\n",
    "    turn_break = re.compile(r\"\\s(?=\\d{1,2}:\\d{2}\\D{2}\\sGuest)|\\s(?=\\d{1,2}:\\d{2}\\D{2}\\sme)\")\n",
    "\n",
    "    # na=False keeps missing transcripts out of the mask instead of raising.\n",
    "    has_preamble = df['Transcript'].str.contains(\". The guest's IP is \", regex=False, na=False)\n",
    "\n",
    "    segmented = {}\n",
    "    for label, transcript in df.loc[has_preamble, 'Transcript'].items():\n",
    "        pieces = preamble.split(transcript)\n",
    "        if len(pieces) < 2:\n",
    "            # Marker text present but not followed by an IP-like token: skip the row.\n",
    "            continue\n",
    "        body = glue.sub(\"\", pieces[1])\n",
    "        body = from_word.sub(\" \", body)\n",
    "        if not first_turn.match(body):\n",
    "            continue\n",
    "        segmented[label] = [re.split(r\"\\s\", turn, maxsplit=2) for turn in turn_break.split(body)]\n",
    "\n",
    "    # Built in one go with an explicit dtype; a bare pd.Series() warns about its default dtype.\n",
    "    return pd.Series(segmented, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "guestIPsegmenter(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nwip = guestIPsegmenter(raw)\n",
    "\n",
    "raw.loc[nwip.index,'segTrans'] = nwip\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def textConvo(df):\n",
    "    \"\"\"Segment transcripts whose first speaker is a `+<digits>` number.\n",
    "\n",
    "    Rows are looked at when ``Transcript`` contains two digits, whitespace and\n",
    "    a `+` followed by at least four digits. They are segmented only when the\n",
    "    transcript starts with `<h:mm> +<7 or more digits> `. A new turn begins at\n",
    "    whitespace followed by a timestamp and either such a number or `me`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Needs a ``Transcript`` column of strings; missing values are ignored.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Indexed by the labels of the rows that were segmented; each value is a\n",
    "        list of ``[time, speaker, text]`` lists.\n",
    "    \"\"\"\n",
    "    # Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "    first_turn = re.compile(r\"^\\d{1,2}:\\d{2}\\s\\+\\d{7,}\\s\")\n",
    "    turn_break = re.compile(r\"\\s(?=\\d{1,2}:\\d{2}\\s\\+\\d{7,})|\\s(?=\\d{1,2}:\\d{2}\\sme)\")\n",
    "\n",
    "    # na=False keeps missing transcripts out of the mask instead of raising.\n",
    "    has_number = df['Transcript'].str.contains(r\"\\d{2}\\s\\+\\d{4,}\", regex=True, na=False)\n",
    "\n",
    "    segmented = {}\n",
    "    for label, transcript in df.loc[has_number, 'Transcript'].items():\n",
    "        if first_turn.match(transcript):\n",
    "            segmented[label] = [re.split(r\"\\s\", turn, maxsplit=2) for turn in turn_break.split(transcript)]\n",
    "\n",
    "    # Built in one go with an explicit dtype; a bare pd.Series() warns about its default dtype.\n",
    "    return pd.Series(segmented, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "textConvo(raw).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ntxt = textConvo(raw)\n",
    "\n",
    "raw.loc[ntxt.index,'segTrans'] = ntxt\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(raw.loc[raw['segCode']==1,].shape)\n",
    "print(raw.loc[raw['segCode']==0,].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['Id']==17171207,]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def newSegmenterUsernames(df):\n",
    "    \"\"\"Segment rows with ``segCode == 1`` whose transcript opens with `<h:mm> <6+ digits>`.\n",
    "\n",
    "    A new turn begins at whitespace followed by `<0-2 digits>:<2 digits>`,\n",
    "    whitespace and a token of at least two non-space characters.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Needs ``segCode`` and ``Transcript`` columns; non-string transcripts\n",
    "        are ignored.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Indexed by the labels of the rows that were segmented; each value is a\n",
    "        list of ``[time, speaker, text]`` lists.\n",
    "    \"\"\"\n",
    "    # Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "    first_turn = re.compile(r\"^\\d{1,2}:\\d{2}\\s\\d{6,}\")\n",
    "    turn_break = re.compile(r\"\\s(?=\\d{0,2}:\\d{2}\\s\\S{2,})\")\n",
    "\n",
    "    pending = df.loc[df['segCode'] == 1, 'Transcript']\n",
    "\n",
    "    segmented = {}\n",
    "    for label, transcript in pending.items():\n",
    "        # One anchored match per row does the work of the former contains() filter\n",
    "        # plus the identical re.match that followed it.\n",
    "        if isinstance(transcript, str) and first_turn.match(transcript):\n",
    "            segmented[label] = [re.split(r\"\\s\", turn, maxsplit=2) for turn in turn_break.split(transcript)]\n",
    "\n",
    "    # Built in one go with an explicit dtype; a bare pd.Series() warns about its default dtype.\n",
    "    return pd.Series(segmented, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newSegmenterUsernames(raw).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nseg = newSegmenterUsernames(raw)\n",
    "\n",
    "raw.loc[nseg.index,'segTrans'] = nseg\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(raw.loc[raw['segCode']==1,].shape)\n",
    "print(raw.loc[raw['segCode']==0,].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sendFileBlock(df):\n",
    "    \"\"\"Segment transcripts that carry the ` DON'T DELETE ME! ` / ` Send File Block ` banner.\n",
    "\n",
    "    Only the text between the first ` Send File Block ` marker and the next\n",
    "    one (or the end of the transcript) is segmented, and only when it starts\n",
    "    with `<h:mm> <token of 2+ characters>`. A new turn begins at whitespace\n",
    "    followed by a timestamp and either a token of five or more characters or\n",
    "    `me`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Needs a ``Transcript`` column of strings; missing values are ignored.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Indexed by the labels of the rows that were segmented; each value is a\n",
    "        list of ``[time, speaker, text]`` lists.\n",
    "    \"\"\"\n",
    "    marker = \" Send File Block \"\n",
    "    # Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "    first_turn = re.compile(r\"^\\d{1,2}:\\d{2}\\s\\S{2,}\")\n",
    "    turn_break = re.compile(r\"\\s(?=\\d{1,2}:\\d{2}\\s\\S{5,}\\s)|\\s(?=\\d{1,2}:\\d{2}\\sme\\s)\")\n",
    "\n",
    "    transcripts = df['Transcript']\n",
    "    # na=False keeps missing transcripts out of the mask instead of raising.\n",
    "    has_banner = (transcripts.str.contains(\" DON'T DELETE ME! \", regex=False, na=False)\n",
    "                  & transcripts.str.contains(marker, regex=False, na=False))\n",
    "\n",
    "    segmented = {}\n",
    "    for label, transcript in transcripts[has_banner].items():\n",
    "        # The mask guarantees at least one marker, so element 1 always exists.\n",
    "        body = transcript.split(marker)[1]\n",
    "        if first_turn.match(body):\n",
    "            segmented[label] = [re.split(r\"\\s\", turn, maxsplit=2) for turn in turn_break.split(body)]\n",
    "\n",
    "    # Built in one go with an explicit dtype; a bare pd.Series() warns about its default dtype.\n",
    "    return pd.Series(segmented, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sendFileBlock(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nblock = sendFileBlock(raw)\n",
    "\n",
    "raw.loc[nblock.index,'segTrans'] = nblock\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(raw.loc[raw['segCode']==1,].shape)\n",
    "print(raw.loc[raw['segCode']==0,].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def strangeGuest(df):\n",
    "    \"\"\"Segment transcripts whose patron appears as ` Guest <3+ digits> `.\n",
    "\n",
    "    A leading `This chat is from <token> ` and/or `From <token> ` header is\n",
    "    removed first. The rest is normalised so every turn reads\n",
    "    `<time><AM|PM> <speaker> <text>`: `Guest 123` becomes `Guest123`,\n",
    "    `10:15 AM` becomes `10:15AM`, and the ` from ` between a timestamp and\n",
    "    `Guest`/`me` is dropped. Transcripts that do not then start with a\n",
    "    timestamp followed by `Guest` are skipped.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Needs a ``Transcript`` column of strings; missing values are ignored.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Indexed by the labels of the rows that were segmented; each value is a\n",
    "        list of ``[time, speaker, text]`` lists.\n",
    "    \"\"\"\n",
    "    # Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "    header_chat = re.compile(r\"\\sfrom\\s\\S{4,}\\s{1}\")\n",
    "    header_from = re.compile(r\"rom\\s\\S{2,}\\s\")\n",
    "    glue = re.compile(r\"(?<=Guest)\\s(?=\\d{2,})|(?<=\\d{2})\\s(?=PM|AM)\")\n",
    "    from_word = re.compile(r\"(?<=\\d{2}(AM|PM))\\sfrom\\s(?=Guest)|(?<=\\d{2}(AM|PM))\\sfrom\\s(?=me\\s)\")\n",
    "    first_turn = re.compile(r\"^\\d{1,2}:\\d{2}\\D{2}\\sGuest\")\n",
    "    turn_break = re.compile(r\"\\s(?=\\d{1,2}:\\d{2}\\D{2}\\sGuest)|\\s(?=\\d{1,2}:\\d{2}\\D{2}\\s\\S{2,}\\s)\")\n",
    "\n",
    "    def drop_header(text, prefix, header):\n",
    "        # maxsplit=1: the header patterns also match the per-message ` from Guest `\n",
    "        # markers, so an unlimited split kept only the text up to the next marker\n",
    "        # and the transcript was then silently skipped.\n",
    "        if not text.startswith(prefix):\n",
    "            return text\n",
    "        pieces = header.split(text, maxsplit=1)\n",
    "        return pieces[1] if len(pieces) == 2 else text\n",
    "\n",
    "    # na=False keeps missing transcripts out of the mask instead of raising.\n",
    "    has_guest = df['Transcript'].str.contains(r\" Guest \\d{3,} \", regex=True, na=False)\n",
    "\n",
    "    segmented = {}\n",
    "    for label, transcript in df.loc[has_guest, 'Transcript'].items():\n",
    "        body = drop_header(transcript, \"This chat is from \", header_chat)\n",
    "        body = drop_header(body, \"From \", header_from)\n",
    "        body = glue.sub(\"\", body)\n",
    "        body = from_word.sub(\" \", body)\n",
    "        if first_turn.match(body):\n",
    "            segmented[label] = [re.split(r\"\\s\", turn, maxsplit=2) for turn in turn_break.split(body)]\n",
    "\n",
    "    # Built in one go with an explicit dtype; a bare pd.Series() warns about its default dtype.\n",
    "    return pd.Series(segmented, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "strangeGuest(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "strg = strangeGuest(raw)\n",
    "\n",
    "raw.loc[strg.index,'segTrans'] = strg\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(raw.loc[raw['segCode']==1,].shape)\n",
    "print(raw.loc[raw['segCode']==0,].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def questionCombinerSimple(df):\n",
    "    \"\"\"Segment `Question + ' ' + Transcript` for rows whose question is itself a timestamped turn.\n",
    "\n",
    "    A row is used when ``segCode == 1``, ``Question`` starts with `<h:mm>` and\n",
    "    ``Transcript`` starts with digits (optionally containing a colon). A new\n",
    "    turn begins at whitespace followed by a timestamp and a token of four or\n",
    "    more characters, or by up to three characters, two digits and ` me `.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Needs ``segCode``, ``Question`` and ``Transcript`` columns; rows with\n",
    "        missing text are ignored.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Indexed by the labels of the rows that were segmented; each value is a\n",
    "        list of ``[time, speaker, text]`` lists.\n",
    "    \"\"\"\n",
    "    # Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "    turn_break = re.compile(r\"\\s(?=\\d{1,2}:\\d{2}\\s\\S{4,}\\s)|\\s(?=\\S{0,3}\\d{2}\\sme\\s)\")\n",
    "\n",
    "    # na=False keeps missing text out of the mask instead of raising.\n",
    "    wanted = (\n",
    "        (df['segCode'] == 1)\n",
    "        & df['Question'].str.contains(r\"^\\d{1,2}:\\d{2}\", regex=True, na=False)\n",
    "        & df['Transcript'].str.contains(r\"^\\d{0,2}:{0,1}\\d{1,2}\", regex=True, na=False)\n",
    "    )\n",
    "    candidates = df.loc[wanted, ['Question', 'Transcript']]\n",
    "\n",
    "    segmented = {}\n",
    "    for label, question, transcript in zip(candidates.index, candidates['Question'], candidates['Transcript']):\n",
    "        # The Question filter already guarantees the combined text starts with a\n",
    "        # timestamp, so no further match is needed before splitting.\n",
    "        combined = question + ' ' + transcript\n",
    "        segmented[label] = [re.split(r\"\\s\", turn, maxsplit=2) for turn in turn_break.split(combined)]\n",
    "\n",
    "    # Built in one go with an explicit dtype; a bare pd.Series() warns about its default dtype.\n",
    "    return pd.Series(segmented, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questionCombinerSimple(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "qcsimp = questionCombinerSimple(raw)\n",
    "\n",
    "raw.loc[qcsimp.index,'segTrans'] = qcsimp\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(raw.loc[raw['segCode']==1,].shape)\n",
    "print(raw.loc[raw['segCode']==0,].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def questionCombinerAdding(df):\n",
    "    \"\"\"Segment transcripts that open with a `me` turn, using the question as the first turn.\n",
    "\n",
    "    A row is used when ``segCode == 1``, ``Transcript`` starts with\n",
    "    `<h:mm> me`, and ``Question`` is longer than 20 characters, contains a\n",
    "    question mark or a timestamp, and contains ` I ` or `you`. The question is\n",
    "    prefixed with a placeholder time and speaker id so it splits like any\n",
    "    other turn.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Needs ``segCode``, ``Question`` and ``Transcript`` columns; rows with\n",
    "        missing text are ignored.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Indexed by the labels of the rows that were segmented; each value is a\n",
    "        list of ``[time, speaker, text]`` lists, the first one carrying the\n",
    "        placeholder time and id.\n",
    "    \"\"\"\n",
    "    # Stand-in `<time> <speaker id>` put in front of the question text.\n",
    "    placeholder = \"25:25 12345678901234567890\"\n",
    "    # Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "    turn_break = re.compile(r\"\\s(?=\\d{1,2}:\\d{2}\\s\\S{4,}\\s)|\\s(?=\\S{0,3}\\d{2}\\sme\\s)\")\n",
    "\n",
    "    question = df['Question']\n",
    "    # na=False keeps missing text out of the mask instead of raising.\n",
    "    wanted = (\n",
    "        (df['segCode'] == 1)\n",
    "        & df['Transcript'].str.contains(r\"^\\d{0,2}:\\d{2}\\sme\", regex=True, na=False)\n",
    "        & (question.str.len() > 20)\n",
    "        & question.str.contains(r\"\\?|\\d{0,2}:\\d{2}\", regex=True, na=False)\n",
    "        & question.str.contains(\" I |you\", regex=True, na=False)\n",
    "    )\n",
    "    candidates = df.loc[wanted, ['Question', 'Transcript']]\n",
    "\n",
    "    segmented = {}\n",
    "    for label, asked, transcript in zip(candidates.index, candidates['Question'], candidates['Transcript']):\n",
    "        # The placeholder makes the combined text start with a timestamp, so it\n",
    "        # can be split straight away.\n",
    "        combined = placeholder + ' ' + asked + ' ' + transcript\n",
    "        segmented[label] = [re.split(r\"\\s\", turn, maxsplit=2) for turn in turn_break.split(combined)]\n",
    "\n",
    "    # Built in one go with an explicit dtype; a bare pd.Series() warns about its default dtype.\n",
    "    return pd.Series(segmented, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questionCombinerAdding(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "qcadd = questionCombinerAdding(raw)\n",
    "\n",
    "raw.loc[qcadd.index,'segTrans'] = qcadd\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(raw.loc[raw['segCode']==1,].shape)\n",
    "print(raw.loc[raw['segCode']==0,].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['segCode'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['Transcript'].str.len().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Print a handful of short transcripts that still have segCode == 1.\n",
    "short_unsegmented = raw.loc[(raw['segCode']==1) &\n",
    "                            (raw['Transcript'].str.len() < 200)\n",
    "                            ,]\n",
    "# Cap the sample size so the cell also runs when fewer than five rows qualify.\n",
    "for i in short_unsegmented.sample(min(5, len(short_unsegmented))).index:\n",
    "    print('\\n')\n",
    "    print(i)\n",
    "    print(raw.loc[i,'Entered by'])\n",
    "    print(raw.loc[i,'Who answers'])\n",
    "    print(raw.loc[i,'Id'])\n",
    "    print(raw.loc[i,'Question Type'])\n",
    "    print(\"QUESTION: \",raw.loc[i,'Question'])\n",
    "    print()\n",
    "    print(raw.loc[i,'Transcript'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc['6619','segTrans']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc['6619','segTrans'][0][1] = 'patron'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc['6619','segTrans'][0][1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#109 operator IDs redacted\n",
    "operators = [\n",
    "    \n",
    "    \"\"\" ALL OPERATOR IDS REDACTED DUE TO PRIVACY CONCERNS  \"\"\"\n",
    "    \n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if \"suepray\" in operators:\n",
    "    print(True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Rewrite the speaker field of every segment in place: labels that start with\n",
    "# seven or more digits, with + and seven or more digits, or with Guest and three\n",
    "# or more digits become 'patron'; labels listed in operators become 'staff'.\n",
    "# p collects the resulting labels so they can be tallied afterwards.\n",
    "# Raw string keeps the regex escapes from being read as (invalid) string escapes;\n",
    "# the former 15-digit test is covered by the 7-digit one.\n",
    "patron_id = re.compile(r\"\\d{7,}|\\+\\d{7,}|Guest\\d{3,4}\")\n",
    "p = []\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        if patron_id.match(k[1]):\n",
    "            k[1] = 'patron'\n",
    "\n",
    "        if k[1] in operators:\n",
    "            k[1] = 'staff'\n",
    "        p.append(k[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Series(p).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set segCode to 1 for rows whose segmented transcript still contains a speaker\n",
    "# label other than 'staff' or 'patron'.\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        if k[1] not in ('staff', 'patron'):\n",
    "            raw.loc[i,'segCode'] = 1\n",
    "            break  # the flag is per row, so the first unknown speaker settles it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def newSegmenterDigits(df):\n",
    "    \"\"\"Segment transcripts that open with `<h:mm> <6+ digits> ` or `<h:mm> me `.\n",
    "\n",
    "    Every row of ``df`` is tried; a new turn begins at whitespace followed by\n",
    "    a timestamp and either a run of six or more digits or `me`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Needs a ``Transcript`` column; non-string transcripts are ignored.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Indexed by the labels of the rows that were segmented; each value is a\n",
    "        list of ``[time, speaker, text]`` lists.\n",
    "    \"\"\"\n",
    "    # Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "    first_turn = re.compile(r\"^\\d{1,2}:\\d{2}\\s\\d{6,}\\s|^\\d{1,2}:\\d{2}\\sme\\s\")\n",
    "    turn_break = re.compile(r\"\\s(?=\\d{1,2}:\\d{2}\\s\\d{6,}\\s)|\\s(?=\\d{1,2}:\\d{2}\\sme\\s)\")\n",
    "\n",
    "    segmented = {}\n",
    "    for label, transcript in df['Transcript'].items():\n",
    "        # Non-string cells (for example NaN) would make re.match raise.\n",
    "        if isinstance(transcript, str) and first_turn.match(transcript):\n",
    "            segmented[label] = [re.split(r\"\\s\", turn, maxsplit=2) for turn in turn_break.split(transcript)]\n",
    "\n",
    "    # Built in one go with an explicit dtype; a bare pd.Series() warns about its default dtype.\n",
    "    return pd.Series(segmented, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nsdi = newSegmenterDigits(raw[raw['segCode']>0])\n",
    "\n",
    "raw.loc[nsdi.index,'segTrans'] = nsdi\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rewrite the speaker field of every segment in place: labels that start with\n",
    "# seven or more digits, with + and seven or more digits, or with Guest and three\n",
    "# or more digits become 'patron'; labels listed in operators become 'staff'.\n",
    "# p collects the resulting labels so they can be tallied afterwards.\n",
    "# Raw string keeps the regex escapes from being read as (invalid) string escapes;\n",
    "# the former 15-digit test is covered by the 7-digit one.\n",
    "patron_id = re.compile(r\"\\d{7,}|\\+\\d{7,}|Guest\\d{3,4}\")\n",
    "p = []\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        if patron_id.match(k[1]):\n",
    "            k[1] = 'patron'\n",
    "\n",
    "        if k[1] in operators:\n",
    "            k[1] = 'staff'\n",
    "        p.append(k[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Series(p).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row labels whose segmented transcript still has a speaker that is neither\n",
    "# 'staff' nor 'patron', sorted and de-duplicated by np.unique.\n",
    "problems = []\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        if k[1] not in ('staff', 'patron'):\n",
    "            problems.append(i)\n",
    "            break  # one hit per row is enough; the former continue here did nothing\n",
    "problems = np.unique(problems)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in problems:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        print(k)\n",
    "    print('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        if re.match(pattern=\"^me\", string=k[1]):\n",
    "            print(i)\n",
    "            print(raw.loc[i,'segTrans'])\n",
    "            print(raw.loc[i,'READ'])\n",
    "            print('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['segCode'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['TransLength'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw['segCode'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['segCode']==1,].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw.loc[raw['segCode']>0,['Id','Who answers','Question','Answer','Transcript','segTrans','TransLength','segCode']].to_csv(\"errorNotes.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def anotherCombinerPartial(df):\n",
    "    \"\"\"Prepend the question as a patron turn to transcripts that start with a staff turn.\n",
    "\n",
    "    For every row with ``segCode == 0`` whose first segment has the speaker\n",
    "    'staff', ``Question`` is turned into a ``[time, 'patron', text]`` segment\n",
    "    and placed in front of the existing segments. The first rule that applies\n",
    "    wins, so a question is never prepended more than once:\n",
    "\n",
    "    1. the question starts with a (possibly truncated) timestamp followed by a\n",
    "       speaker token: time and text are split off, the speaker is relabelled;\n",
    "    2. the question starts with a run of six or more digits: that id is\n",
    "       relabelled and the time is set to '00:00';\n",
    "    3. the question starts with one of the listed opening words, or the row's\n",
    "       ``Id`` is in the manual list: the whole question is used with time '00:00'.\n",
    "\n",
    "    The frame passed in, including the lists stored in ``segTrans``, is not\n",
    "    modified.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Needs ``Id``, ``segCode``, ``Question`` and ``segTrans`` columns.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Indexed by the labels of the rows that were extended; each value is the\n",
    "        new list of ``[time, speaker, text]`` segments.\n",
    "    \"\"\"\n",
    "    manualList = {\n",
    "        1730574,\n",
    "        1818642,\n",
    "        1956449,\n",
    "        3982553,\n",
    "        4855425,\n",
    "        5327849,\n",
    "        15968272,\n",
    "        15975821,\n",
    "        20067306,\n",
    "        29023598,\n",
    "        29574714,\n",
    "        29757550,\n",
    "        39263779,\n",
    "    }\n",
    "\n",
    "    # Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "    # The two former timestamp tests are merged: they overlapped and produced the\n",
    "    # same segment, so a question matching both used to be inserted twice.\n",
    "    stamped = re.compile(r\"^\\d{1,2}:\\d{2}\\s\\d{6,}\\s|^:\\d{2}\\s\\d{6,}\\s|^\\d{2}\\s\\d{6,}\\s|^\\d{1,2}:\\d{2}\\s\\S{2,}\\s\")\n",
    "    bare_id = re.compile(r\"^\\d{6,}\\s\")\n",
    "    opener = re.compile(r\"^could\\s|^is\\s|^if\\s|^hey\\s|^hello\\S{0,1}\\s|^hi\\S{0,1}\\s|^hello\\s|^does\\s|^do\\s|^how\\s|^i\\s|^i'm\\s|^who\\s|^where\\s|^what\\s|^can\\s|^why\\s\", flags=re.IGNORECASE)\n",
    "\n",
    "    extended = {}\n",
    "    for label in df[df['segCode']==0].index:\n",
    "        segments = df.loc[label,'segTrans']\n",
    "        question = df.loc[label,'Question']\n",
    "        if not segments or segments[0][1] != 'staff' or not isinstance(question, str):\n",
    "            continue\n",
    "\n",
    "        if stamped.match(question):\n",
    "            first = re.split(r\"\\s\", question, maxsplit=2)\n",
    "            first[1] = 'patron'\n",
    "        elif bare_id.match(question):\n",
    "            first = ['00:00', 'patron', re.split(r\"\\s\", question, maxsplit=1)[1]]\n",
    "        elif opener.match(question) or df.loc[label,'Id'] in manualList:\n",
    "            first = ['00:00', 'patron', question]\n",
    "        else:\n",
    "            continue\n",
    "\n",
    "        # Build a new list: list.insert on the stored one would also change the\n",
    "        # caller's frame, because copying a frame does not copy the lists inside it.\n",
    "        extended[label] = [first] + list(segments)\n",
    "\n",
    "    # Built in one go with an explicit dtype; a bare pd.Series() warns about its default dtype.\n",
    "    return pd.Series(extended, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "anotherCombinerPartial(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "acp = anotherCombinerPartial(raw)\n",
    "\n",
    "raw.loc[acp.index,'segTrans'] = acp\n",
    "\n",
    "# segCode is 1 where segTrans is still the empty string and 0 everywhere else.\n",
    "# A single pass over the column replaces the row-by-row .loc writes.\n",
    "raw['segCode'] = raw['segTrans'].map(lambda seg: 1 if seg == '' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for i in raw[raw['segCode']==0].index:\n",
    "    if raw.loc[i,'segTrans'][0][1] == 'staff':\n",
    "#         print(raw.loc[i,'Transcript'])\n",
    "#         for j in raw.loc[i,'segTrans'][:5]:\n",
    "#             print(j)\n",
    "        print(raw.loc[i,'Question'])\n",
    "        print(raw.loc[i,'Id'])\n",
    "        print('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the first segment of a transcript when its text contains 'transfer from'\n",
    "# (any case) and consists of exactly three whitespace-separated tokens.\n",
    "counter = 0\n",
    "ids = []\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans'][0:1]:\n",
    "        try:\n",
    "            if re.search(pattern=\"transfer from\",flags=re.IGNORECASE,string=k[2]):\n",
    "                if len(re.split(string=k[2],pattern=r'\\s')) == 3:\n",
    "                    ids.append(i)\n",
    "                    counter += 1\n",
    "        except (IndexError, TypeError):\n",
    "            # Segment without a text field, or a text field that is not a string.\n",
    "            continue\n",
    "\n",
    "mod = [raw.loc[i,'segTrans'][1:] for i in ids]\n",
    "# Assign through an index-aligned object Series, like the other segTrans updates,\n",
    "# so the nested lists are stored one per row whatever their lengths.\n",
    "raw.loc[ids,'segTrans'] = pd.Series(mod, index=ids, dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Blank the text of every segment that contains 'transfer from' (any case) and\n",
    "# consists of exactly three whitespace-separated tokens. ids collects the row\n",
    "# labels touched and counter the number of segments blanked.\n",
    "counter = 0\n",
    "ids = []\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        try:\n",
    "            if re.search(pattern=\"transfer from\",flags=re.IGNORECASE,string=k[2]):\n",
    "                if len(re.split(string=k[2],pattern=r'\\s')) == 3:\n",
    "                    k[2] = \"\"\n",
    "                    ids.append(i)\n",
    "                    counter += 1\n",
    "        except (IndexError, TypeError):\n",
    "            # Segment without a text field, or a text field that is not a string.\n",
    "            continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Blank the text of every segment that contains 'has left the conversation'\n",
    "# (any case) and consists of exactly four whitespace-separated tokens. ids\n",
    "# collects the row labels touched and counter the number of segments blanked.\n",
    "counter = 0\n",
    "ids = []\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        try:\n",
    "            if re.search(pattern=\"has left the conversation\",flags=re.IGNORECASE,string=k[2]):\n",
    "                if len(re.split(string=k[2],pattern=r'\\s')) == 4:\n",
    "                    k[2] = \"\"\n",
    "                    ids.append(i)\n",
    "                    counter += 1\n",
    "        except (IndexError, TypeError):\n",
    "            # Segment without a text field, or a text field that is not a string.\n",
    "            continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Blank the text of every segment that contains 'answered by' (any case) and\n",
    "# consists of exactly three whitespace-separated tokens. ids collects the row\n",
    "# labels touched and counter the number of segments blanked.\n",
    "counter = 0\n",
    "ids = []\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        try:\n",
    "            if re.search(pattern=\"answered by\",flags=re.IGNORECASE,string=k[2]):\n",
    "                if len(re.split(string=k[2],pattern=r'\\s')) == 3:\n",
    "                    k[2] = \"\"\n",
    "                    ids.append(i)\n",
    "                    counter += 1\n",
    "        except (IndexError, TypeError):\n",
    "            # Segment without a text field, or a text field that is not a string.\n",
    "            continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Blank the text of every segment that contains 'transferring to <word>... please wait'\n",
    "# (any case) and consists of exactly five whitespace-separated tokens. ids\n",
    "# collects the row labels touched and counter the number of segments blanked.\n",
    "counter = 0\n",
    "ids = []\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i,'segTrans']:\n",
    "        try:\n",
    "            if re.search(pattern=r\"transferring to \\w+\\.{3} please wait\",flags=re.IGNORECASE,string=k[2]):\n",
    "                if len(re.split(string=k[2],pattern=r'\\s')) == 5:\n",
    "                    k[2] = \"\"\n",
    "                    ids.append(i)\n",
    "                    counter += 1\n",
    "        except (IndexError, TypeError):\n",
    "            # Segment without a text field, or a text field that is not a string.\n",
    "            continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Blank the text of 'continued from' segments that split into exactly 3\n",
    "# whitespace-delimited tokens.\n",
    "counter = 0  # number of segments blanked\n",
    "ids = []     # intentionally left empty: row labels are not recorded for this pattern\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i, 'segTrans']:\n",
    "        try:\n",
    "            if (re.search(pattern=r'continued from', flags=re.IGNORECASE, string=k[2])\n",
    "                    and len(re.split(string=k[2], pattern=r'\\s')) == 3):\n",
    "                k[2] = ''\n",
    "                counter += 1\n",
    "        except Exception:\n",
    "            # Best effort: skip segments that cannot be indexed or matched.\n",
    "            # Unlike a bare `except:`, this does not swallow KeyboardInterrupt,\n",
    "            # so the loop can still be stopped from the kernel.\n",
    "            continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Blank the text of 'this is a continuing conversation' segments that split\n",
    "# into exactly 8 whitespace-delimited tokens, then display how many were blanked.\n",
    "counter = 0  # number of segments blanked\n",
    "ids = []     # row label, recorded once per blanked segment\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i, 'segTrans']:\n",
    "        try:\n",
    "            if (re.search(pattern=r'this is a continuing conversation',\n",
    "                          flags=re.IGNORECASE, string=k[2])\n",
    "                    and len(re.split(string=k[2], pattern=r'\\s')) == 8):\n",
    "                k[2] = ''\n",
    "                ids.append(i)\n",
    "                counter += 1\n",
    "        except Exception:\n",
    "            # Best effort: skip segments that cannot be indexed or matched.\n",
    "            # Unlike a bare `except:`, this does not swallow KeyboardInterrupt,\n",
    "            # so the loop can still be stopped from the kernel.\n",
    "            continue\n",
    "counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace 'my name is <token>' with a fixed placeholder in every segment text,\n",
    "# then display how many segments were changed.\n",
    "name_pattern = re.compile(r'my name is\\s\\S+', flags=re.IGNORECASE)\n",
    "counter = 0  # number of segments changed (not number of substitutions)\n",
    "ids = []     # row label, recorded once per changed segment\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i, 'segTrans']:\n",
    "        try:\n",
    "            if name_pattern.search(k[2]):\n",
    "                k[2] = name_pattern.sub(' my name is nameredacted ', k[2])\n",
    "                ids.append(i)\n",
    "                counter += 1\n",
    "        except Exception:\n",
    "            # Best effort: skip segments that cannot be indexed or matched.\n",
    "            # Unlike a bare `except:`, this does not swallow KeyboardInterrupt.\n",
    "            continue\n",
    "counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace a non-word character followed by 'ill@' and the rest of that token\n",
    "# with a fixed placeholder, then display how many segments were changed.\n",
    "# Must run before the generic e-mail redaction, which would otherwise consume\n",
    "# these addresses first.\n",
    "ill_email_pattern = re.compile(r'\\Will@\\S+', flags=re.IGNORECASE)\n",
    "counter = 0  # number of segments changed (not number of substitutions)\n",
    "ids = []     # row label, recorded once per changed segment\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i, 'segTrans']:\n",
    "        try:\n",
    "            if ill_email_pattern.search(k[2]):\n",
    "                k[2] = ill_email_pattern.sub(' emailill ', k[2])\n",
    "                ids.append(i)\n",
    "                counter += 1\n",
    "        except Exception:\n",
    "            # Best effort: skip segments that cannot be indexed or matched.\n",
    "            # Unlike a bare `except:`, this does not swallow KeyboardInterrupt.\n",
    "            continue\n",
    "counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace every whitespace-delimited token containing '@' with a fixed\n",
    "# placeholder, then display how many segments were changed.\n",
    "email_pattern = re.compile(r'\\S+@\\S+', flags=re.IGNORECASE)\n",
    "counter = 0  # number of segments changed (not number of substitutions)\n",
    "ids = []     # row label, recorded once per changed segment\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i, 'segTrans']:\n",
    "        try:\n",
    "            if email_pattern.search(k[2]):\n",
    "                k[2] = email_pattern.sub(' emailredacted ', k[2])\n",
    "                ids.append(i)\n",
    "                counter += 1\n",
    "        except Exception:\n",
    "            # Best effort: skip segments that cannot be indexed or matched.\n",
    "            # Unlike a bare `except:`, this does not swallow KeyboardInterrupt.\n",
    "            continue\n",
    "counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace runs of masking characters (3+ asterisks, or a run of x characters)\n",
    "# that sit between two non-word characters with a fixed placeholder, then\n",
    "# display how many segments were changed.\n",
    "# NOTE(review): `xxx{3,}` means 'xx' followed by three or more 'x', i.e. five or\n",
    "# more x characters in total; confirm that `x{3,}` was not intended.\n",
    "masked_pattern = re.compile(r'\\W\\*{3,}\\W|\\Wxxx{3,}\\W', flags=re.IGNORECASE)\n",
    "counter = 0  # number of segments changed (not number of substitutions)\n",
    "ids = []     # row label, recorded once per changed segment\n",
    "for i in raw.index:\n",
    "    for k in raw.loc[i, 'segTrans']:\n",
    "        try:\n",
    "            if masked_pattern.search(k[2]):\n",
    "                k[2] = masked_pattern.sub(' redactedinfo ', k[2])\n",
    "                ids.append(i)\n",
    "                counter += 1\n",
    "        except Exception:\n",
    "            # Best effort: skip segments that cannot be indexed or matched.\n",
    "            # Unlike a bare `except:`, this does not swallow KeyboardInterrupt.\n",
    "            continue\n",
    "counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the first segment of a transcript when it repeats the second one\n",
    "# (same value in slot 1 and same text in slot 2).\n",
    "for i in raw.index:\n",
    "    segments = raw.loc[i, 'segTrans']\n",
    "    try:\n",
    "        is_repeat = (segments[0][2] == segments[1][2]\n",
    "                     and segments[0][1] == segments[1][1])\n",
    "    except (IndexError, KeyError, TypeError):\n",
    "        # Fewer than two segments, or a segment without slots 1 and 2.\n",
    "        continue\n",
    "    if is_repeat:\n",
    "        # .at stores the list as a single cell value. Assigning a list through\n",
    "        # .loc[i, col] is treated as several values for one cell and raises\n",
    "        # ValueError, which a blanket except around the assignment would hide.\n",
    "        raw.at[i, 'segTrans'] = segments[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getPatronLines(df):\n",
    "    print('Building Patrons Lines')        \n",
    "    df = df.copy()\n",
    "    for i in df.index:\n",
    "        patronText = str()\n",
    "        for k in df.loc[i,'segTrans']:\n",
    "            if len(k) == 3:\n",
    "                if k[1] == 'patron':\n",
    "                    patronText = patronText + k[2] + ' '\n",
    "        df.loc[i,'PatronTextString'] = patronText\n",
    "        \n",
    "        \n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebind raw to the copy that carries the PatronTextString column.\n",
    "raw = getPatronLines(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getStaffLines(df):\n",
    "    print('Building Operator Lines')        \n",
    "    df = df.copy()\n",
    "    for i in df.index:\n",
    "        staffText = str()\n",
    "        for k in df.loc[i,'segTrans']:\n",
    "            if len(k) == 3:\n",
    "                if k[1] == 'staff':\n",
    "                    staffText = staffText + k[2] + ' '\n",
    "        df.loc[i,'StaffTextString'] = staffText\n",
    "        \n",
    "        \n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebind raw to the copy that carries the StaffTextString column.\n",
    "raw = getStaffLines(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getAllLines(df):\n",
    "    print('Building All Lines')        \n",
    "    df = df.copy()\n",
    "    for i in df.index:\n",
    "        text = str()\n",
    "        for k in df.loc[i,'segTrans']:\n",
    "            if len(k) == 3:\n",
    "                text = text + k[2] + ' '\n",
    "        df.loc[i,'AllTextString'] = text\n",
    "        \n",
    "        \n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebind raw to the copy that carries the AllTextString column.\n",
    "raw = getAllLines(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the cleaned frame, including the derived text columns, as JSON.\n",
    "raw.to_json(path_or_buf='full.json')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}