Apollo-Guidance-Computer/Hate_Speech_Ethiopia.ipynb
2020-12-09 21:52:18 -05:00

1410 lines
No EOL
55 KiB
Text

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Hate Speech - Ethiopia.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyMrbg9XCwH/rahVzjhlwI0Y",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/ishaterdal/Apollo-11/blob/master/Hate_Speech_Ethiopia.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "0GZ0Y0S_Nv3m"
},
"source": [
"import pandas as pd"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "d-AgFL-yR7eR"
},
"source": [
"terms = pd.read_csv('/content/Terms.csv')"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"id": "pucm09BHTbd3",
"outputId": "3a2a2aac-de5c-4285-cee3-d8b1cc1c64ef"
},
"source": [
"terms.head()"
],
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>List_the_hate_speech_phrase_with_a_comma</th>\n",
" <th>a_Term</th>\n",
" <th>a_Term_001</th>\n",
" <th>a_Term_002</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>SARANGA TI WALI</td>\n",
" <td>TAXI-MOTO</td>\n",
" <td>VOYOUX</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>rien</td>\n",
" <td>rien</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>HAINE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>TETUE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" List_the_hate_speech_phrase_with_a_comma ... a_Term_002\n",
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n",
"1 rien ... NaN\n",
"2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n",
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n",
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n",
"\n",
"[5 rows x 4 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "rbIDpJd1WMaN",
"outputId": "d4a3cda3-2033-4b31-8b86-28408f0e13c2"
},
"source": [
"print(terms)"
],
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": [
" List_the_hate_speech_phrase_with_a_comma ... a_Term_002\n",
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n",
"1 rien ... NaN\n",
"2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n",
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n",
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n",
".. ... ... ...\n",
"477 chamarocka, kholo, nda kangue ... nda kangue\n",
"478 Amdjoudoul iwacki koudjoumass ... koundjou mass\n",
"479 Bengue, arabo, pro français, pro russes, moutons, ... Cannibales\n",
"480 Gangster ... NaN\n",
"481 - Lawa Lawa\\n- Benguè\\n- Bandaï ... Bandaï\n",
"\n",
"[482 rows x 4 columns]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 175
},
"id": "uUU-nueeWSqW",
"outputId": "31ea9473-7207-478b-cb7d-fac47b8ccf92"
},
"source": [
"terms.describe()"
],
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>List_the_hate_speech_phrase_with_a_comma</th>\n",
" <th>a_Term</th>\n",
" <th>a_Term_001</th>\n",
" <th>a_Term_002</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>482</td>\n",
" <td>482</td>\n",
" <td>284</td>\n",
" <td>185</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>481</td>\n",
" <td>403</td>\n",
" <td>246</td>\n",
" <td>168</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>ETRANGER ; RELIGION ; POLITIQUE ;</td>\n",
" <td>ETRANGER</td>\n",
" <td>BENGUE</td>\n",
" <td>BENGUE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>2</td>\n",
" <td>11</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" List_the_hate_speech_phrase_with_a_comma a_Term a_Term_001 a_Term_002\n",
"count 482 482 284 185\n",
"unique 481 403 246 168\n",
"top ETRANGER ; RELIGION ; POLITIQUE ; ETRANGER BENGUE BENGUE\n",
"freq 2 11 5 5"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "q4cU4Q_hXdsQ"
},
"source": [
"# Replace the verbose original column headers with short names.\n",
"# Reassigning instead of using inplace=True leaves the cell displaying nothing, as before.\n",
"terms = terms.rename(\n",
"    columns={\n",
"        'List_the_hate_speech_phrase_with_a_comma': 'terms_list',\n",
"        'a_Term': 'term_1',\n",
"        'a_Term_001': 'term_2',\n",
"        'a_Term_002': 'term_3',\n",
"    }\n",
")"
],
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 781
},
"id": "bS2p70oZX7cg",
"outputId": "bbee9b17-d4c8-4562-8528-3e06f1821554"
},
"source": [
"terms.head(20)"
],
"execution_count": 20,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>terms_list</th>\n",
" <th>term_1</th>\n",
" <th>term_2</th>\n",
" <th>term_3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>SARANGA TI WALI</td>\n",
" <td>TAXI-MOTO</td>\n",
" <td>VOYOUX</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>rien</td>\n",
" <td>rien</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>HAINE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>TETUE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>MO KPA GUI A ALBUNOS ; MO GUI MBI FURU NA MO ...</td>\n",
" <td>MO KPA GUI A ALBUNOS</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>RELIGION ; ETHNIQUE , FOOT-BALL</td>\n",
" <td>RELIGION</td>\n",
" <td>ETHNIQUE</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>VOYOUTISME ; IMPURE ; MECREANT ;</td>\n",
" <td>VOYOUTISME</td>\n",
" <td>IMPURE</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>KPANDA ; LE TI BONGO TI MO OKO SO KOUE ; MO S...</td>\n",
" <td>kPANDA</td>\n",
" <td>LE TI BONGO T I MO OKO SO KOUE</td>\n",
" <td>MO SO MO YEKE NA GNE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>FAUSSEUR ; HOMO SEXUEL ; BARBARIE ;</td>\n",
" <td>FAUSSEUR</td>\n",
" <td>HOMO SEXUEL</td>\n",
" <td>BARBARIE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>CONSIDERER LES MUSULMANS COMME LES TERRORISTES</td>\n",
" <td>IDIOT</td>\n",
" <td>BANDAYE</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>BRAQUEUR ; DESORDONNE ; FOU ;</td>\n",
" <td>BRAQUEUR</td>\n",
" <td>DESORDONNE</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>EGALITE ENTRE LES SEXSES ; GUERRE ; NATIONNALI...</td>\n",
" <td>EGALITE ENTRE LES SEXES</td>\n",
" <td>DROIT DE L'ENFANT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>CRFISE DE COVID 19 ; CRISE ECONOMIQUE ;</td>\n",
" <td>GROUPE DE BANDITS</td>\n",
" <td>ILLETRE</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Balaka, Seleka, a baba so</td>\n",
" <td>Balaka</td>\n",
" <td>Seleka</td>\n",
" <td>Ti ala a baba so</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Gagango, arabou, soukoula bi</td>\n",
" <td>Gagango</td>\n",
" <td>Arabou</td>\n",
" <td>Soukoula mbi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>SARANGA ; BORDEL</td>\n",
" <td>SARANGA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>A GA GANGO ; ALA GA LAWA ; MO NI SO KOUE LA</td>\n",
" <td>A GA GANGO</td>\n",
" <td>ALA GA LAWA</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>ESCROC ; IDIOT ; I MOU MO</td>\n",
" <td>ESCROC</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>L'insulte,la division, le racisme, l'ethnocent...</td>\n",
" <td>Idiot</td>\n",
" <td>Bon à rien!</td>\n",
" <td>Âne</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" terms_list ... term_3\n",
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n",
"1 rien ... NaN\n",
"2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n",
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n",
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n",
"5 MO KPA GUI A ALBUNOS ; MO GUI MBI FURU NA MO ... ... NaN\n",
"6 RELIGION ; ETHNIQUE , FOOT-BALL ... NaN\n",
"7 VOYOUTISME ; IMPURE ; MECREANT ; ... NaN\n",
"8 KPANDA ; LE TI BONGO TI MO OKO SO KOUE ; MO S... ... MO SO MO YEKE NA GNE\n",
"9 FAUSSEUR ; HOMO SEXUEL ; BARBARIE ; ... BARBARIE\n",
"10 CONSIDERER LES MUSULMANS COMME LES TERRORISTES ... NaN\n",
"11 BRAQUEUR ; DESORDONNE ; FOU ; ... NaN\n",
"12 EGALITE ENTRE LES SEXSES ; GUERRE ; NATIONNALI... ... NaN\n",
"13 CRFISE DE COVID 19 ; CRISE ECONOMIQUE ; ... NaN\n",
"14 Balaka, Seleka, a baba so ... Ti ala a baba so\n",
"15 Gagango, arabou, soukoula bi ... Soukoula mbi\n",
"16 SARANGA ; BORDEL ... NaN\n",
"17 A GA GANGO ; ALA GA LAWA ; MO NI SO KOUE LA ... NaN\n",
"18 ESCROC ; IDIOT ; I MOU MO ... NaN\n",
"19 L'insulte,la division, le racisme, l'ethnocent... ... Âne\n",
"\n",
"[20 rows x 4 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 20
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xrvxLq8naX8x",
"outputId": "c3136ea0-625d-4831-9e09-b3c83b00a35d"
},
"source": [
"terms['term_1'].value_counts()"
],
"execution_count": 19,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"ETRANGER 11\n",
"ETHNIQUE 6\n",
"A MBO TI TOUADERA 6\n",
"MBORORO 5\n",
"RELIGION 4\n",
" ..\n",
"FOUNGO TERE 1\n",
"GA GA NGON 1\n",
"ALA A BABA SO LA ? ; 1\n",
"JE SUIS FACA ; 1\n",
"GROUPE DE BANDITS 1\n",
"Name: term_1, Length: 403, dtype: int64"
]
},
"metadata": {
"tags": []
},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "buyvQpOJh322"
},
"source": [
"from sklearn import preprocessing"
],
"execution_count": 22,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jSdlaj2L1DY1"
},
"source": [
"section2 = terms['terms_list']"
],
"execution_count": 24,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aX3cBB_51S3_",
"outputId": "f30015d2-c884-4330-9734-08f1393d92ef"
},
"source": [
"section2"
],
"execution_count": 25,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,\n",
"1 rien\n",
"2 HAINE ; RELIGION ; ETHNIQUE ;\n",
"3 TETUE ; VOYOU ; MO YINGA MBI ?\n",
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...\n",
" ... \n",
"477 chamarocka, kholo, nda kangue\n",
"478 Amdjoudoul iwacki koudjoumass\n",
"479 Bengue, arabo, pro français, pro russes, moutons,\n",
"480 Gangster\n",
"481 - Lawa Lawa\\n- Benguè\\n- Bandaï\n",
"Name: terms_list, Length: 482, dtype: object"
]
},
"metadata": {
"tags": []
},
"execution_count": 25
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ot60Lmew6qp0",
"outputId": "3735a9c3-5c4e-4489-caf0-f8ab54c950b2"
},
"source": [
"# %pip installs into the environment of the running kernel (plain !pip may target another one).\n",
"# The pin matches the release this notebook was last executed with, so re-runs stay reproducible.\n",
"%pip install -q contractions==0.0.43\n",
"import contractions"
],
"execution_count": 26,
"outputs": [
{
"output_type": "stream",
"text": [
"Collecting contractions\n",
" Downloading https://files.pythonhosted.org/packages/ce/ad/d1c685967945a04f8596128b15a1ab56c51488f53312e953341af6ff22d1/contractions-0.0.43-py2.py3-none-any.whl\n",
"Collecting textsearch\n",
" Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl\n",
"Collecting pyahocorasick\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)\n",
"\u001b[K |████████████████████████████████| 317kB 5.9MB/s \n",
"\u001b[?25hCollecting Unidecode\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)\n",
"\u001b[K |████████████████████████████████| 245kB 41.9MB/s \n",
"\u001b[?25hBuilding wheels for collected packages: pyahocorasick\n",
" Building wheel for pyahocorasick (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for pyahocorasick: filename=pyahocorasick-1.4.0-cp36-cp36m-linux_x86_64.whl size=81707 sha256=0cb88880bcc215b7a3749858fd619d028c638f276938a7ffe08d22897d33c4d8\n",
" Stored in directory: /root/.cache/pip/wheels/0a/90/61/87a55f5b459792fbb2b7ba6b31721b06ff5cf6bde541b40994\n",
"Successfully built pyahocorasick\n",
"Installing collected packages: pyahocorasick, Unidecode, textsearch, contractions\n",
"Successfully installed Unidecode-1.1.1 contractions-0.0.43 pyahocorasick-1.4.0 textsearch-0.0.17\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "yFq0P9PHFMFr"
},
"source": [
"# Split each response on whitespace and pass every token through contractions.fix,\n",
"# storing the resulting list of tokens per row.\n",
"terms['no_contract'] = section2.apply(\n",
"    lambda phrase: [contractions.fix(token) for token in phrase.split()]\n",
")"
],
"execution_count": 27,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 276
},
"id": "v6Q_V3rWFoNA",
"outputId": "0d3d3388-87ad-4d37-fd94-30a9137b94e4"
},
"source": [
"terms.head()"
],
"execution_count": 28,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>terms_list</th>\n",
" <th>term_1</th>\n",
" <th>term_2</th>\n",
" <th>term_3</th>\n",
" <th>no_contract</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>SARANGA TI WALI</td>\n",
" <td>TAXI-MOTO</td>\n",
" <td>VOYOUX</td>\n",
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>rien</td>\n",
" <td>rien</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[rien]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>HAINE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>TETUE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" terms_list ... no_contract\n",
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]\n",
"1 rien ... [rien]\n",
"2 HAINE ; RELIGION ; ETHNIQUE ; ... [HAINE, ;, RELIGION, ;, ETHNIQUE, ;]\n",
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... [TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]\n",
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...\n",
"\n",
"[5 rows x 5 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 28
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 293
},
"id": "yCUL-zsLQ4-O",
"outputId": "0dbc0192-3ead-40d9-f81d-00d575672d0a"
},
"source": [
"# Re-join each row's token list into a single space-separated string,\n",
"# which is the input the tokenizer cell works on.\n",
"terms['msg_str'] = terms['no_contract'].apply(lambda tokens: ' '.join(map(str, tokens)))\n",
"terms.head()"
],
"execution_count": 36,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>terms_list</th>\n",
" <th>term_1</th>\n",
" <th>term_2</th>\n",
" <th>term_3</th>\n",
" <th>no_contract</th>\n",
" <th>msg_str</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>SARANGA TI WALI</td>\n",
" <td>TAXI-MOTO</td>\n",
" <td>VOYOUX</td>\n",
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>rien</td>\n",
" <td>rien</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[rien]</td>\n",
" <td>rien</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>HAINE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>TETUE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" terms_list ... msg_str\n",
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,\n",
"1 rien ... rien\n",
"2 HAINE ; RELIGION ; ETHNIQUE ; ... HAINE ; RELIGION ; ETHNIQUE ;\n",
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... TETUE ; VOYOU ; MO YINGA MBI ?\n",
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...\n",
"\n",
"[5 rows x 6 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 36
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cEwARPq_GG08",
"outputId": "f568dc5f-a4f2-44c4-9964-89c0e712fb8e"
},
"source": [
"import nltk\n",
"nltk.download('punkt')\n",
"from nltk.tokenize import word_tokenize"
],
"execution_count": 30,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "18uESl1iP1RL"
},
"source": [
"text = \"Hi, I would like to tokenize this sentence\""
],
"execution_count": 31,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MF66YtCuP5YZ",
"outputId": "72c38ef0-513a-435a-8950-80de388ea66c"
},
"source": [
"print(word_tokenize(text))"
],
"execution_count": 32,
"outputs": [
{
"output_type": "stream",
"text": [
"['Hi', ',', 'I', 'would', 'like', 'to', 'tokenize', 'this', 'sentence']\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "tAsC9yEvQNNt"
},
"source": [
"terms['tokenized'] = terms['msg_str'].apply(word_tokenize)"
],
"execution_count": 38,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 328
},
"id": "YbkjpCyiRRNt",
"outputId": "782297b9-595b-4aba-bb89-ceea73fdc3ff"
},
"source": [
"terms.head()"
],
"execution_count": 39,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>terms_list</th>\n",
" <th>term_1</th>\n",
" <th>term_2</th>\n",
" <th>term_3</th>\n",
" <th>no_contract</th>\n",
" <th>msg_str</th>\n",
" <th>tokenized</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>SARANGA TI WALI</td>\n",
" <td>TAXI-MOTO</td>\n",
" <td>VOYOUX</td>\n",
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>rien</td>\n",
" <td>rien</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[rien]</td>\n",
" <td>rien</td>\n",
" <td>[rien]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>HAINE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>TETUE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" terms_list ... tokenized\n",
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...\n",
"1 rien ... [rien]\n",
"2 HAINE ; RELIGION ; ETHNIQUE ; ... [HAINE, ;, RELIGION, ;, ETHNIQUE, ;]\n",
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... [TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]\n",
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...\n",
"\n",
"[5 rows x 7 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 39
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 380
},
"id": "_w9FQIj9SrTG",
"outputId": "dfef973c-bfcb-4c76-d891-c91e4c0c4bd7"
},
"source": [
"# Lower-case every token so that differently-cased spellings of a term compare equal.\n",
"terms['lower'] = terms['tokenized'].apply(lambda tokens: list(map(str.lower, tokens)))\n",
"terms.head()"
],
"execution_count": 41,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>terms_list</th>\n",
" <th>term_1</th>\n",
" <th>term_2</th>\n",
" <th>term_3</th>\n",
" <th>no_contract</th>\n",
" <th>msg_str</th>\n",
" <th>tokenized</th>\n",
" <th>lower</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>SARANGA TI WALI</td>\n",
" <td>TAXI-MOTO</td>\n",
" <td>VOYOUX</td>\n",
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...</td>\n",
" <td>[saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>rien</td>\n",
" <td>rien</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[rien]</td>\n",
" <td>rien</td>\n",
" <td>[rien]</td>\n",
" <td>[rien]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>HAINE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
" <td>[haine, ;, religion, ;, ethnique, ;]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>TETUE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
" <td>[tetue, ;, voyou, ;, mo, yinga, mbi, ?]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...</td>\n",
" <td>[les, gbakas, mandja, sont, trop, egoistes, ;,...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" terms_list ... lower\n",
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...\n",
"1 rien ... [rien]\n",
"2 HAINE ; RELIGION ; ETHNIQUE ; ... [haine, ;, religion, ;, ethnique, ;]\n",
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... [tetue, ;, voyou, ;, mo, yinga, mbi, ?]\n",
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [les, gbakas, mandja, sont, trop, egoistes, ;,...\n",
"\n",
"[5 rows x 8 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 41
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 380
},
"id": "GXCYOG-XTYJy",
"outputId": "5936953c-3c9e-49a9-dcda-3418e1ce939c"
},
"source": [
"import string\n",
"\n",
"punc = string.punctuation\n",
"# `word not in punc` is a substring test against one long string, so multi-character\n",
"# punctuation tokens from word_tokenize (e.g. '...', '``', \"''\", '--') were kept.\n",
"# Checking character by character against a set drops every token that is made up\n",
"# only of punctuation (empty tokens are dropped too, as they were before).\n",
"punc_chars = set(punc)\n",
"terms['no_punc'] = terms['lower'].apply(\n",
"    lambda tokens: [token for token in tokens if not all(char in punc_chars for char in token)]\n",
")\n",
"terms.head()"
],
"execution_count": 42,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>terms_list</th>\n",
" <th>term_1</th>\n",
" <th>term_2</th>\n",
" <th>term_3</th>\n",
" <th>no_contract</th>\n",
" <th>msg_str</th>\n",
" <th>tokenized</th>\n",
" <th>lower</th>\n",
" <th>no_punc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>SARANGA TI WALI</td>\n",
" <td>TAXI-MOTO</td>\n",
" <td>VOYOUX</td>\n",
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]</td>\n",
" <td>SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,</td>\n",
" <td>[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...</td>\n",
" <td>[saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...</td>\n",
" <td>[saranga, ti, wali, taxi, -moto, voyou]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>rien</td>\n",
" <td>rien</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[rien]</td>\n",
" <td>rien</td>\n",
" <td>[rien]</td>\n",
" <td>[rien]</td>\n",
" <td>[rien]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>HAINE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
" <td>HAINE ; RELIGION ; ETHNIQUE ;</td>\n",
" <td>[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]</td>\n",
" <td>[haine, ;, religion, ;, ethnique, ;]</td>\n",
" <td>[haine, religion, ethnique]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>TETUE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
" <td>TETUE ; VOYOU ; MO YINGA MBI ?</td>\n",
" <td>[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]</td>\n",
" <td>[tetue, ;, voyou, ;, mo, yinga, mbi, ?]</td>\n",
" <td>[tetue, voyou, mo, yinga, mbi]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES</td>\n",
" <td>LES YAKOMAS SONT DES ORGUEILLEUX</td>\n",
" <td>LES MANDJA SONT DES GRANDS VOLEURS ;</td>\n",
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...</td>\n",
" <td>LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...</td>\n",
" <td>[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...</td>\n",
" <td>[les, gbakas, mandja, sont, trop, egoistes, ;,...</td>\n",
" <td>[les, gbakas, mandja, sont, trop, egoistes, le...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" terms_list ... no_punc\n",
"0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [saranga, ti, wali, taxi, -moto, voyou]\n",
"1 rien ... [rien]\n",
"2 HAINE ; RELIGION ; ETHNIQUE ; ... [haine, religion, ethnique]\n",
"3 TETUE ; VOYOU ; MO YINGA MBI ? ... [tetue, voyou, mo, yinga, mbi]\n",
"4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [les, gbakas, mandja, sont, trop, egoistes, le...\n",
"\n",
"[5 rows x 9 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 42
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UYk5KcIZUJAZ",
"outputId": "85b4298c-44e8-4714-a51b-fae2b2afecaf"
},
"source": [
"# Count the cleaned tokens (lower-cased, punctuation removed) instead of raw whitespace\n",
"# splits of terms_list, so separators such as ';' and '?' no longer top the table and\n",
"# case variants of the same term are merged. Rows with no tokens explode to NaN, which\n",
"# value_counts ignores by default.\n",
"terms['no_punc'].explode().value_counts().head(50)"
],
"execution_count": 55,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"; 943\n",
"TI 274\n",
"MO 188\n",
"A 174\n",
"ZO 102\n",
"SO 88\n",
"ALA 78\n",
"LA 77\n",
"LO 69\n",
"BA 49\n",
"NA 46\n",
"GA 44\n",
"? 44\n",
"MBI 38\n",
"YEKE 35\n",
", 34\n",
"BOUBA 33\n",
"LES 31\n",
"WALI 29\n",
"APE 28\n",
"TU 28\n",
"I 26\n",
"BENGUE 26\n",
"MAMA 24\n",
"AWE 22\n",
"LAWA 20\n",
"DE 19\n",
"ARABO 19\n",
"PINDOUNGOU 18\n",
"DES 18\n",
"RELIGION 17\n",
"TOUADERA 17\n",
"MBORORO 17\n",
"ETHNIQUE 16\n",
"GANGO 16\n",
"ETRANGER 16\n",
"YA 15\n",
"LE 15\n",
"KATA 15\n",
"MBO 14\n",
"GBAYA 14\n",
"ME 13\n",
"ES 13\n",
"BANDA 13\n",
"TA 13\n",
"POLITIQUE 13\n",
"INGA 13\n",
"KE 13\n",
"SELEKA 13\n",
"ANDE 12\n",
"dtype: int64"
]
},
"metadata": {
"tags": []
},
"execution_count": 55
}
]
}
]
}