mirror of
https://github.com/chrislgarry/Apollo-11.git
synced 2025-02-21 03:00:30 +00:00
1410 lines
No EOL
55 KiB
Text
1410 lines
No EOL
55 KiB
Text
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "Hate Speech - Ethiopia.ipynb",
|
|
"provenance": [],
|
|
"authorship_tag": "ABX9TyMrbg9XCwH/rahVzjhlwI0Y",
|
|
"include_colab_link": true
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
}
|
|
},
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "view-in-github",
|
|
"colab_type": "text"
|
|
},
|
|
"source": [
|
|
"<a href=\"https://colab.research.google.com/github/ishaterdal/Apollo-11/blob/master/Hate_Speech_Ethiopia.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "0GZ0Y0S_Nv3m"
|
|
},
|
|
"source": [
|
|
"import pandas as pd"
|
|
],
|
|
"execution_count": 2,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "d-AgFL-yR7eR"
|
|
},
|
|
"source": [
|
|
"terms = pd.read_csv('/content/Terms.csv')"
|
|
],
|
|
"execution_count": 6,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 224
|
|
},
|
|
"id": "pucm09BHTbd3",
|
|
"outputId": "3a2a2aac-de5c-4285-cee3-d8b1cc1c64ef"
|
|
},
|
|
"source": [
|
|
"terms.head()"
|
|
],
|
|
"execution_count": 7,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>List_the_hate_speech_phrase_with_a_comma</th>\n",
|
|
" <th>a_Term</th>\n",
|
|
" <th>a_Term_001</th>\n",
|
|
" <th>a_Term_002</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>SARANGA TI WALI</td>\n",
|
|
" <td>TAXI-MOTO</td>\n",
|
|
" <td>VOYOUX</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>HAINE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>TETUE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
|
|
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
|
|
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" List_the_hate_speech_phrase_with_a_comma ... a_Term_002\n",
|
|
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n",
|
|
"1 rien ... NaN\n",
|
|
"2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n",
|
|
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n",
|
|
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n",
|
|
"\n",
|
|
"[5 rows x 4 columns]"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 7
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "rbIDpJd1WMaN",
|
|
"outputId": "d4a3cda3-2033-4b31-8b86-28408f0e13c2"
|
|
},
|
|
"source": [
|
|
"print(terms)"
|
|
],
|
|
"execution_count": 9,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"text": [
|
|
" List_the_hate_speech_phrase_with_a_comma ... a_Term_002\n",
|
|
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n",
|
|
"1 rien ... NaN\n",
|
|
"2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n",
|
|
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n",
|
|
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n",
|
|
".. ... ... ...\n",
|
|
"477 chamarocka, kholo, nda kangue ... nda kangue\n",
|
|
"478 Amdjoudoul iwacki koudjoumass ... koundjou mass\n",
|
|
"479 Bengue, arabo, pro français, pro russes, moutons, ... Cannibales\n",
|
|
"480 Gangster ... NaN\n",
|
|
"481 - Lawa Lawa\\n- Benguè\\n- Bandaï ... Bandaï\n",
|
|
"\n",
|
|
"[482 rows x 4 columns]\n"
|
|
],
|
|
"name": "stdout"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 175
|
|
},
|
|
"id": "uUU-nueeWSqW",
|
|
"outputId": "31ea9473-7207-478b-cb7d-fac47b8ccf92"
|
|
},
|
|
"source": [
|
|
"terms.describe()"
|
|
],
|
|
"execution_count": 10,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>List_the_hate_speech_phrase_with_a_comma</th>\n",
|
|
" <th>a_Term</th>\n",
|
|
" <th>a_Term_001</th>\n",
|
|
" <th>a_Term_002</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>482</td>\n",
|
|
" <td>482</td>\n",
|
|
" <td>284</td>\n",
|
|
" <td>185</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>481</td>\n",
|
|
" <td>403</td>\n",
|
|
" <td>246</td>\n",
|
|
" <td>168</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>ETRANGER ; RELIGION ; POLITIQUE ;</td>\n",
|
|
" <td>ETRANGER</td>\n",
|
|
" <td>BENGUE</td>\n",
|
|
" <td>BENGUE</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>11</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>5</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" List_the_hate_speech_phrase_with_a_comma a_Term a_Term_001 a_Term_002\n",
|
|
"count 482 482 284 185\n",
|
|
"unique 481 403 246 168\n",
|
|
"top ETRANGER ; RELIGION ; POLITIQUE ; ETRANGER BENGUE BENGUE\n",
|
|
"freq 2 11 5 5"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 10
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "q4cU4Q_hXdsQ"
|
|
},
|
|
"source": [
|
|
"terms.rename(columns={'List_the_hate_speech_phrase_with_a_comma':'terms_list', 'a_Term':'term_1', 'a_Term_001':'term_2','a_Term_002':'term_3'}, inplace=True)"
|
|
],
|
|
"execution_count": 13,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 781
|
|
},
|
|
"id": "bS2p70oZX7cg",
|
|
"outputId": "bbee9b17-d4c8-4562-8528-3e06f1821554"
|
|
},
|
|
"source": [
|
|
"terms.head(20)"
|
|
],
|
|
"execution_count": 20,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>terms_list</th>\n",
|
|
" <th>term_1</th>\n",
|
|
" <th>term_2</th>\n",
|
|
" <th>term_3</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>SARANGA TI WALI</td>\n",
|
|
" <td>TAXI-MOTO</td>\n",
|
|
" <td>VOYOUX</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>HAINE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>TETUE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
|
|
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
|
|
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>MO KPA GUI A ALBUNOS ; MO GUI MBI FURU NA MO ...</td>\n",
|
|
" <td>MO KPA GUI A ALBUNOS</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>RELIGION ; ETHNIQUE , FOOT-BALL</td>\n",
|
|
" <td>RELIGION</td>\n",
|
|
" <td>ETHNIQUE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>VOYOUTISME ; IMPURE ; MECREANT ;</td>\n",
|
|
" <td>VOYOUTISME</td>\n",
|
|
" <td>IMPURE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>KPANDA ; LE TI BONGO TI MO OKO SO KOUE ; MO S...</td>\n",
|
|
" <td>kPANDA</td>\n",
|
|
" <td>LE TI BONGO T I MO OKO SO KOUE</td>\n",
|
|
" <td>MO SO MO YEKE NA GNE</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>9</th>\n",
|
|
" <td>FAUSSEUR ; HOMO SEXUEL ; BARBARIE ;</td>\n",
|
|
" <td>FAUSSEUR</td>\n",
|
|
" <td>HOMO SEXUEL</td>\n",
|
|
" <td>BARBARIE</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>10</th>\n",
|
|
" <td>CONSIDERER LES MUSULMANS COMME LES TERRORISTES</td>\n",
|
|
" <td>IDIOT</td>\n",
|
|
" <td>BANDAYE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>11</th>\n",
|
|
" <td>BRAQUEUR ; DESORDONNE ; FOU ;</td>\n",
|
|
" <td>BRAQUEUR</td>\n",
|
|
" <td>DESORDONNE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>12</th>\n",
|
|
" <td>EGALITE ENTRE LES SEXSES ; GUERRE ; NATIONNALI...</td>\n",
|
|
" <td>EGALITE ENTRE LES SEXES</td>\n",
|
|
" <td>DROIT DE L'ENFANT</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>13</th>\n",
|
|
" <td>CRFISE DE COVID 19 ; CRISE ECONOMIQUE ;</td>\n",
|
|
" <td>GROUPE DE BANDITS</td>\n",
|
|
" <td>ILLETRE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>14</th>\n",
|
|
" <td>Balaka, Seleka, a baba so</td>\n",
|
|
" <td>Balaka</td>\n",
|
|
" <td>Seleka</td>\n",
|
|
" <td>Ti ala a baba so</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>15</th>\n",
|
|
" <td>Gagango, arabou, soukoula bi</td>\n",
|
|
" <td>Gagango</td>\n",
|
|
" <td>Arabou</td>\n",
|
|
" <td>Soukoula mbi</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>16</th>\n",
|
|
" <td>SARANGA ; BORDEL</td>\n",
|
|
" <td>SARANGA</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>17</th>\n",
|
|
" <td>A GA GANGO ; ALA GA LAWA ; MO NI SO KOUE LA</td>\n",
|
|
" <td>A GA GANGO</td>\n",
|
|
" <td>ALA GA LAWA</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>18</th>\n",
|
|
" <td>ESCROC ; IDIOT ; I MOU MO</td>\n",
|
|
" <td>ESCROC</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>19</th>\n",
|
|
" <td>L'insulte,la division, le racisme, l'ethnocent...</td>\n",
|
|
" <td>Idiot</td>\n",
|
|
" <td>Bon à rien!</td>\n",
|
|
" <td>Âne</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" terms_list ... term_3\n",
|
|
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n",
|
|
"1 rien ... NaN\n",
|
|
"2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n",
|
|
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n",
|
|
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n",
|
|
"5 MO KPA GUI A ALBUNOS ; MO GUI MBI FURU NA MO ... ... NaN\n",
|
|
"6 RELIGION ; ETHNIQUE , FOOT-BALL ... NaN\n",
|
|
"7 VOYOUTISME ; IMPURE ; MECREANT ; ... NaN\n",
|
|
"8 KPANDA ; LE TI BONGO TI MO OKO SO KOUE ; MO S... ... MO SO MO YEKE NA GNE\n",
|
|
"9 FAUSSEUR ; HOMO SEXUEL ; BARBARIE ; ... BARBARIE\n",
|
|
"10 CONSIDERER LES MUSULMANS COMME LES TERRORISTES ... NaN\n",
|
|
"11 BRAQUEUR ; DESORDONNE ; FOU ; ... NaN\n",
|
|
"12 EGALITE ENTRE LES SEXSES ; GUERRE ; NATIONNALI... ... NaN\n",
|
|
"13 CRFISE DE COVID 19 ; CRISE ECONOMIQUE ; ... NaN\n",
|
|
"14 Balaka, Seleka, a baba so ... Ti ala a baba so\n",
|
|
"15 Gagango, arabou, soukoula bi ... Soukoula mbi\n",
|
|
"16 SARANGA ; BORDEL ... NaN\n",
|
|
"17 A GA GANGO ; ALA GA LAWA ; MO NI SO KOUE LA ... NaN\n",
|
|
"18 ESCROC ; IDIOT ; I MOU MO ... NaN\n",
|
|
"19 L'insulte,la division, le racisme, l'ethnocent... ... Âne\n",
|
|
"\n",
|
|
"[20 rows x 4 columns]"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 20
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "xrvxLq8naX8x",
|
|
"outputId": "c3136ea0-625d-4831-9e09-b3c83b00a35d"
|
|
},
|
|
"source": [
|
|
"terms['term_1'].value_counts()"
|
|
],
|
|
"execution_count": 19,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/plain": [
|
|
"ETRANGER 11\n",
|
|
"ETHNIQUE 6\n",
|
|
"A MBO TI TOUADERA 6\n",
|
|
"MBORORO 5\n",
|
|
"RELIGION 4\n",
|
|
" ..\n",
|
|
"FOUNGO TERE 1\n",
|
|
"GA GA NGON 1\n",
|
|
"ALA A BABA SO LA ? ; 1\n",
|
|
"JE SUIS FACA ; 1\n",
|
|
"GROUPE DE BANDITS 1\n",
|
|
"Name: term_1, Length: 403, dtype: int64"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 19
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "buyvQpOJh322"
|
|
},
|
|
"source": [
|
|
"from sklearn import preprocessing"
|
|
],
|
|
"execution_count": 22,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "jSdlaj2L1DY1"
|
|
},
|
|
"source": [
|
|
"section2 = terms['terms_list']"
|
|
],
|
|
"execution_count": 24,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "aX3cBB_51S3_",
|
|
"outputId": "f30015d2-c884-4330-9734-08f1393d92ef"
|
|
},
|
|
"source": [
|
|
"section2"
|
|
],
|
|
"execution_count": 25,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/plain": [
|
|
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,\n",
|
|
"1 rien\n",
|
|
"2 HAINE ; RELIGION ; ETHNIQUE ;\n",
|
|
"3 TETUE ; VOYOU ; MO YINGA MBI ?\n",
|
|
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...\n",
|
|
" ... \n",
|
|
"477 chamarocka, kholo, nda kangue\n",
|
|
"478 Amdjoudoul iwacki koudjoumass\n",
|
|
"479 Bengue, arabo, pro français, pro russes, moutons,\n",
|
|
"480 Gangster\n",
|
|
"481 - Lawa Lawa\\n- Benguè\\n- Bandaï\n",
|
|
"Name: terms_list, Length: 482, dtype: object"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 25
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "ot60Lmew6qp0",
|
|
"outputId": "3735a9c3-5c4e-4489-caf0-f8ab54c950b2"
|
|
},
|
|
"source": [
|
|
"!pip install contractions\n",
|
|
"import contractions"
|
|
],
|
|
"execution_count": 26,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Collecting contractions\n",
|
|
" Downloading https://files.pythonhosted.org/packages/ce/ad/d1c685967945a04f8596128b15a1ab56c51488f53312e953341af6ff22d1/contractions-0.0.43-py2.py3-none-any.whl\n",
|
|
"Collecting textsearch\n",
|
|
" Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl\n",
|
|
"Collecting pyahocorasick\n",
|
|
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)\n",
|
|
"\u001b[K |████████████████████████████████| 317kB 5.9MB/s \n",
|
|
"\u001b[?25hCollecting Unidecode\n",
|
|
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)\n",
|
|
"\u001b[K |████████████████████████████████| 245kB 41.9MB/s \n",
|
|
"\u001b[?25hBuilding wheels for collected packages: pyahocorasick\n",
|
|
" Building wheel for pyahocorasick (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
|
" Created wheel for pyahocorasick: filename=pyahocorasick-1.4.0-cp36-cp36m-linux_x86_64.whl size=81707 sha256=0cb88880bcc215b7a3749858fd619d028c638f276938a7ffe08d22897d33c4d8\n",
|
|
" Stored in directory: /root/.cache/pip/wheels/0a/90/61/87a55f5b459792fbb2b7ba6b31721b06ff5cf6bde541b40994\n",
|
|
"Successfully built pyahocorasick\n",
|
|
"Installing collected packages: pyahocorasick, Unidecode, textsearch, contractions\n",
|
|
"Successfully installed Unidecode-1.1.1 contractions-0.0.43 pyahocorasick-1.4.0 textsearch-0.0.17\n"
|
|
],
|
|
"name": "stdout"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "yFq0P9PHFMFr"
|
|
},
|
|
"source": [
|
|
"terms['no_contract'] = section2.apply(lambda x: [contractions.fix(word) for word in x.split()])"
|
|
],
|
|
"execution_count": 27,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 276
|
|
},
|
|
"id": "v6Q_V3rWFoNA",
|
|
"outputId": "0d3d3388-87ad-4d37-fd94-30a9137b94e4"
|
|
},
|
|
"source": [
|
|
"terms.head()"
|
|
],
|
|
"execution_count": 28,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>terms_list</th>\n",
|
|
" <th>term_1</th>\n",
|
|
" <th>term_2</th>\n",
|
|
" <th>term_3</th>\n",
|
|
" <th>no_contract</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>SARANGA TI WALI</td>\n",
|
|
" <td>TAXI-MOTO</td>\n",
|
|
" <td>VOYOUX</td>\n",
|
|
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>HAINE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>TETUE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
|
|
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
|
|
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
|
|
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" terms_list ... no_contract\n",
|
|
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]\n",
|
|
"1 rien ... [rien]\n",
|
|
"2 HAINE ; RELIGION ; ETHNIQUE ; ... [HAINE, ;, RELIGION, ;, ETHNIQUE, ;]\n",
|
|
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... [TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]\n",
|
|
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...\n",
|
|
"\n",
|
|
"[5 rows x 5 columns]"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 28
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 293
|
|
},
|
|
"id": "yCUL-zsLQ4-O",
|
|
"outputId": "0dbc0192-3ead-40d9-f81d-00d575672d0a"
|
|
},
|
|
"source": [
|
|
"terms[\"msg_str\"] = [' '.join(map(str, l)) for l in terms['no_contract']]\n",
|
|
"terms.head()"
|
|
],
|
|
"execution_count": 36,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>terms_list</th>\n",
|
|
" <th>term_1</th>\n",
|
|
" <th>term_2</th>\n",
|
|
" <th>term_3</th>\n",
|
|
" <th>no_contract</th>\n",
|
|
" <th>msg_str</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>SARANGA TI WALI</td>\n",
|
|
" <td>TAXI-MOTO</td>\n",
|
|
" <td>VOYOUX</td>\n",
|
|
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" <td>rien</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>HAINE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>TETUE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
|
|
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
|
|
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
|
|
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" terms_list ... msg_str\n",
|
|
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,\n",
|
|
"1 rien ... rien\n",
|
|
"2 HAINE ; RELIGION ; ETHNIQUE ; ... HAINE ; RELIGION ; ETHNIQUE ;\n",
|
|
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... TETUE ; VOYOU ; MO YINGA MBI ?\n",
|
|
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...\n",
|
|
"\n",
|
|
"[5 rows x 6 columns]"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 36
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "cEwARPq_GG08",
|
|
"outputId": "f568dc5f-a4f2-44c4-9964-89c0e712fb8e"
|
|
},
|
|
"source": [
|
|
"import nltk\n",
|
|
"nltk.download('punkt')\n",
|
|
"from nltk.tokenize import word_tokenize"
|
|
],
|
|
"execution_count": 30,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
|
|
"[nltk_data] Package punkt is already up-to-date!\n"
|
|
],
|
|
"name": "stdout"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "18uESl1iP1RL"
|
|
},
|
|
"source": [
|
|
"text = \"Hi, I would like to tokenize this sentence\""
|
|
],
|
|
"execution_count": 31,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "MF66YtCuP5YZ",
|
|
"outputId": "72c38ef0-513a-435a-8950-80de388ea66c"
|
|
},
|
|
"source": [
|
|
"print(word_tokenize(text))"
|
|
],
|
|
"execution_count": 32,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['Hi', ',', 'I', 'would', 'like', 'to', 'tokenize', 'this', 'sentence']\n"
|
|
],
|
|
"name": "stdout"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "tAsC9yEvQNNt"
|
|
},
|
|
"source": [
|
|
"terms['tokenized'] = terms['msg_str'].apply(word_tokenize)"
|
|
],
|
|
"execution_count": 38,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 328
|
|
},
|
|
"id": "YbkjpCyiRRNt",
|
|
"outputId": "782297b9-595b-4aba-bb89-ceea73fdc3ff"
|
|
},
|
|
"source": [
|
|
"terms.head()"
|
|
],
|
|
"execution_count": 39,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>terms_list</th>\n",
|
|
" <th>term_1</th>\n",
|
|
" <th>term_2</th>\n",
|
|
" <th>term_3</th>\n",
|
|
" <th>no_contract</th>\n",
|
|
" <th>msg_str</th>\n",
|
|
" <th>tokenized</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>SARANGA TI WALI</td>\n",
|
|
" <td>TAXI-MOTO</td>\n",
|
|
" <td>VOYOUX</td>\n",
|
|
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>HAINE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>TETUE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
|
|
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
|
|
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
|
|
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" terms_list ... tokenized\n",
|
|
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...\n",
|
|
"1 rien ... [rien]\n",
|
|
"2 HAINE ; RELIGION ; ETHNIQUE ; ... [HAINE, ;, RELIGION, ;, ETHNIQUE, ;]\n",
|
|
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... [TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]\n",
|
|
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...\n",
|
|
"\n",
|
|
"[5 rows x 7 columns]"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 39
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 380
|
|
},
|
|
"id": "_w9FQIj9SrTG",
|
|
"outputId": "dfef973c-bfcb-4c76-d891-c91e4c0c4bd7"
|
|
},
|
|
"source": [
|
|
"terms['lower'] = terms['tokenized'].apply(lambda x: [word.lower() for word in x])\n",
|
|
"terms.head()"
|
|
],
|
|
"execution_count": 41,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>terms_list</th>\n",
|
|
" <th>term_1</th>\n",
|
|
" <th>term_2</th>\n",
|
|
" <th>term_3</th>\n",
|
|
" <th>no_contract</th>\n",
|
|
" <th>msg_str</th>\n",
|
|
" <th>tokenized</th>\n",
|
|
" <th>lower</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>SARANGA TI WALI</td>\n",
|
|
" <td>TAXI-MOTO</td>\n",
|
|
" <td>VOYOUX</td>\n",
|
|
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...</td>\n",
|
|
" <td>[saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>HAINE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
|
|
" <td>[haine, ;, religion, ;, ethnique, ;]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>TETUE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
|
|
" <td>[tetue, ;, voyou, ;, mo, yinga, mbi, ?]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
|
|
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
|
|
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
|
|
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...</td>\n",
|
|
" <td>[les, gbakas, mandja, sont, trop, egoistes, ;,...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" terms_list ... lower\n",
|
|
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...\n",
|
|
"1 rien ... [rien]\n",
|
|
"2 HAINE ; RELIGION ; ETHNIQUE ; ... [haine, ;, religion, ;, ethnique, ;]\n",
|
|
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... [tetue, ;, voyou, ;, mo, yinga, mbi, ?]\n",
|
|
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [les, gbakas, mandja, sont, trop, egoistes, ;,...\n",
|
|
"\n",
|
|
"[5 rows x 8 columns]"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 41
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 380
|
|
},
|
|
"id": "GXCYOG-XTYJy",
|
|
"outputId": "5936953c-3c9e-49a9-dcda-3418e1ce939c"
|
|
},
|
|
"source": [
|
|
"import string\n",
|
|
"punc = string.punctuation\n",
|
|
"terms['no_punc'] = terms['lower'].apply(lambda x: [word for word in x if word not in punc])\n",
|
|
"terms.head()"
|
|
],
|
|
"execution_count": 42,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>terms_list</th>\n",
|
|
" <th>term_1</th>\n",
|
|
" <th>term_2</th>\n",
|
|
" <th>term_3</th>\n",
|
|
" <th>no_contract</th>\n",
|
|
" <th>msg_str</th>\n",
|
|
" <th>tokenized</th>\n",
|
|
" <th>lower</th>\n",
|
|
" <th>no_punc</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>SARANGA TI WALI</td>\n",
|
|
" <td>TAXI-MOTO</td>\n",
|
|
" <td>VOYOUX</td>\n",
|
|
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
|
|
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
|
|
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...</td>\n",
|
|
" <td>[saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...</td>\n",
|
|
" <td>[saranga, ti, wali, taxi, -moto, voyou]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" <td>rien</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" <td>[rien]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>HAINE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
|
|
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
|
|
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
|
|
" <td>[haine, ;, religion, ;, ethnique, ;]</td>\n",
|
|
" <td>[haine, religion, ethnique]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>TETUE</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
|
|
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
|
|
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
|
|
" <td>[tetue, ;, voyou, ;, mo, yinga, mbi, ?]</td>\n",
|
|
" <td>[tetue, voyou, mo, yinga, mbi]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
|
|
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
|
|
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
|
|
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
|
|
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
|
|
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...</td>\n",
|
|
" <td>[les, gbakas, mandja, sont, trop, egoistes, ;,...</td>\n",
|
|
" <td>[les, gbakas, mandja, sont, trop, egoistes, le...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" terms_list ... no_punc\n",
|
|
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [saranga, ti, wali, taxi, -moto, voyou]\n",
|
|
"1 rien ... [rien]\n",
|
|
"2 HAINE ; RELIGION ; ETHNIQUE ; ... [haine, religion, ethnique]\n",
|
|
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... [tetue, voyou, mo, yinga, mbi]\n",
|
|
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [les, gbakas, mandja, sont, trop, egoistes, le...\n",
|
|
"\n",
|
|
"[5 rows x 9 columns]"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 42
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "UYk5KcIZUJAZ",
|
|
"outputId": "85b4298c-44e8-4714-a51b-fae2b2afecaf"
|
|
},
|
|
"source": [
|
|
"terms.terms_list.str.split(expand=True).stack().value_counts()[:50]"
|
|
],
|
|
"execution_count": 55,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/plain": [
|
|
"; 943\n",
|
|
"TI 274\n",
|
|
"MO 188\n",
|
|
"A 174\n",
|
|
"ZO 102\n",
|
|
"SO 88\n",
|
|
"ALA 78\n",
|
|
"LA 77\n",
|
|
"LO 69\n",
|
|
"BA 49\n",
|
|
"NA 46\n",
|
|
"GA 44\n",
|
|
"? 44\n",
|
|
"MBI 38\n",
|
|
"YEKE 35\n",
|
|
", 34\n",
|
|
"BOUBA 33\n",
|
|
"LES 31\n",
|
|
"WALI 29\n",
|
|
"APE 28\n",
|
|
"TU 28\n",
|
|
"I 26\n",
|
|
"BENGUE 26\n",
|
|
"MAMA 24\n",
|
|
"AWE 22\n",
|
|
"LAWA 20\n",
|
|
"DE 19\n",
|
|
"ARABO 19\n",
|
|
"PINDOUNGOU 18\n",
|
|
"DES 18\n",
|
|
"RELIGION 17\n",
|
|
"TOUADERA 17\n",
|
|
"MBORORO 17\n",
|
|
"ETHNIQUE 16\n",
|
|
"GANGO 16\n",
|
|
"ETRANGER 16\n",
|
|
"YA 15\n",
|
|
"LE 15\n",
|
|
"KATA 15\n",
|
|
"MBO 14\n",
|
|
"GBAYA 14\n",
|
|
"ME 13\n",
|
|
"ES 13\n",
|
|
"BANDA 13\n",
|
|
"TA 13\n",
|
|
"POLITIQUE 13\n",
|
|
"INGA 13\n",
|
|
"KE 13\n",
|
|
"SELEKA 13\n",
|
|
"ANDE 12\n",
|
|
"dtype: int64"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 55
|
|
}
|
|
]
|
|
}
|
|
]
|
|
} |