{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Hate Speech - Ethiopia.ipynb", "provenance": [], "authorship_tag": "ABX9TyMrbg9XCwH/rahVzjhlwI0Y", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "metadata": { "id": "0GZ0Y0S_Nv3m" }, "source": [ "import pandas as pd" ], "execution_count": 2, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "d-AgFL-yR7eR" }, "source": [ "terms = pd.read_csv('/content/Terms.csv')" ], "execution_count": 6, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 224 }, "id": "pucm09BHTbd3", "outputId": "3a2a2aac-de5c-4285-cee3-d8b1cc1c64ef" }, "source": [ "terms.head()" ], "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
List_the_hate_speech_phrase_with_a_commaa_Terma_Term_001a_Term_002
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX
1rienrienNaNNaN
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;
\n", "
" ], "text/plain": [ " List_the_hate_speech_phrase_with_a_comma ... a_Term_002\n", "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n", "1 rien ... NaN\n", "2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n", "3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n", "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n", "\n", "[5 rows x 4 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 7 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rbIDpJd1WMaN", "outputId": "d4a3cda3-2033-4b31-8b86-28408f0e13c2" }, "source": [ "print(terms)" ], "execution_count": 9, "outputs": [ { "output_type": "stream", "text": [ " List_the_hate_speech_phrase_with_a_comma ... a_Term_002\n", "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n", "1 rien ... NaN\n", "2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n", "3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n", "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n", ".. ... ... ...\n", "477 chamarocka, kholo, nda kangue ... nda kangue\n", "478 Amdjoudoul iwacki koudjoumass ... koundjou mass\n", "479 Bengue, arabo, pro français, pro russes, moutons, ... Cannibales\n", "480 Gangster ... NaN\n", "481 - Lawa Lawa\\n- Benguè\\n- Bandaï ... Bandaï\n", "\n", "[482 rows x 4 columns]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 175 }, "id": "uUU-nueeWSqW", "outputId": "31ea9473-7207-478b-cb7d-fac47b8ccf92" }, "source": [ "terms.describe()" ], "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
List_the_hate_speech_phrase_with_a_commaa_Terma_Term_001a_Term_002
count482482284185
unique481403246168
topETRANGER ; RELIGION ; POLITIQUE ;ETRANGERBENGUEBENGUE
freq21155
\n", "
" ], "text/plain": [ " List_the_hate_speech_phrase_with_a_comma a_Term a_Term_001 a_Term_002\n", "count 482 482 284 185\n", "unique 481 403 246 168\n", "top ETRANGER ; RELIGION ; POLITIQUE ; ETRANGER BENGUE BENGUE\n", "freq 2 11 5 5" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "code", "metadata": { "id": "q4cU4Q_hXdsQ" }, "source": [ "# Replace the long original headers with short, uniform column names.\n", "terms = terms.rename(columns={\n", "    'List_the_hate_speech_phrase_with_a_comma': 'terms_list',\n", "    'a_Term': 'term_1',\n", "    'a_Term_001': 'term_2',\n", "    'a_Term_002': 'term_3',\n", "})" ], "execution_count": 13, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 781 }, "id": "bS2p70oZX7cg", "outputId": "bbee9b17-d4c8-4562-8528-3e06f1821554" }, "source": [ "terms.head(20)" ], "execution_count": 20, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
terms_listterm_1term_2term_3
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX
1rienrienNaNNaN
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;
5MO KPA GUI A ALBUNOS ; MO GUI MBI FURU NA MO ...MO KPA GUI A ALBUNOSNaNNaN
6RELIGION ; ETHNIQUE , FOOT-BALLRELIGIONETHNIQUENaN
7VOYOUTISME ; IMPURE ; MECREANT ;VOYOUTISMEIMPURENaN
8KPANDA ; LE TI BONGO TI MO OKO SO KOUE ; MO S...kPANDALE TI BONGO T I MO OKO SO KOUEMO SO MO YEKE NA GNE
9FAUSSEUR ; HOMO SEXUEL ; BARBARIE ;FAUSSEURHOMO SEXUELBARBARIE
10CONSIDERER LES MUSULMANS COMME LES TERRORISTESIDIOTBANDAYENaN
11BRAQUEUR ; DESORDONNE ; FOU ;BRAQUEURDESORDONNENaN
12EGALITE ENTRE LES SEXSES ; GUERRE ; NATIONNALI...EGALITE ENTRE LES SEXESDROIT DE L'ENFANTNaN
13CRFISE DE COVID 19 ; CRISE ECONOMIQUE ;GROUPE DE BANDITSILLETRENaN
14Balaka, Seleka, a baba soBalakaSelekaTi ala a baba so
15Gagango, arabou, soukoula biGagangoArabouSoukoula mbi
16SARANGA ; BORDELSARANGANaNNaN
17A GA GANGO ; ALA GA LAWA ; MO NI SO KOUE LAA GA GANGOALA GA LAWANaN
18ESCROC ; IDIOT ; I MOU MOESCROCNaNNaN
19L'insulte,la division, le racisme, l'ethnocent...IdiotBon à rien!Âne
\n", "
" ], "text/plain": [ " terms_list ... term_3\n", "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n", "1 rien ... NaN\n", "2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n", "3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n", "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n", "5 MO KPA GUI A ALBUNOS ; MO GUI MBI FURU NA MO ... ... NaN\n", "6 RELIGION ; ETHNIQUE , FOOT-BALL ... NaN\n", "7 VOYOUTISME ; IMPURE ; MECREANT ; ... NaN\n", "8 KPANDA ; LE TI BONGO TI MO OKO SO KOUE ; MO S... ... MO SO MO YEKE NA GNE\n", "9 FAUSSEUR ; HOMO SEXUEL ; BARBARIE ; ... BARBARIE\n", "10 CONSIDERER LES MUSULMANS COMME LES TERRORISTES ... NaN\n", "11 BRAQUEUR ; DESORDONNE ; FOU ; ... NaN\n", "12 EGALITE ENTRE LES SEXSES ; GUERRE ; NATIONNALI... ... NaN\n", "13 CRFISE DE COVID 19 ; CRISE ECONOMIQUE ; ... NaN\n", "14 Balaka, Seleka, a baba so ... Ti ala a baba so\n", "15 Gagango, arabou, soukoula bi ... Soukoula mbi\n", "16 SARANGA ; BORDEL ... NaN\n", "17 A GA GANGO ; ALA GA LAWA ; MO NI SO KOUE LA ... NaN\n", "18 ESCROC ; IDIOT ; I MOU MO ... NaN\n", "19 L'insulte,la division, le racisme, l'ethnocent... ... Âne\n", "\n", "[20 rows x 4 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 20 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xrvxLq8naX8x", "outputId": "c3136ea0-625d-4831-9e09-b3c83b00a35d" }, "source": [ "terms['term_1'].value_counts()" ], "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "ETRANGER 11\n", "ETHNIQUE 6\n", "A MBO TI TOUADERA 6\n", "MBORORO 5\n", "RELIGION 4\n", " ..\n", "FOUNGO TERE 1\n", "GA GA NGON 1\n", "ALA A BABA SO LA ? 
; 1\n", "JE SUIS FACA ; 1\n", "GROUPE DE BANDITS 1\n", "Name: term_1, Length: 403, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] }, { "cell_type": "code", "metadata": { "id": "buyvQpOJh322" }, "source": [ "from sklearn import preprocessing" ], "execution_count": 22, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "jSdlaj2L1DY1" }, "source": [ "section2 = terms['terms_list']" ], "execution_count": 24, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aX3cBB_51S3_", "outputId": "f30015d2-c884-4330-9734-08f1393d92ef" }, "source": [ "section2" ], "execution_count": 25, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,\n", "1 rien\n", "2 HAINE ; RELIGION ; ETHNIQUE ;\n", "3 TETUE ; VOYOU ; MO YINGA MBI ?\n", "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...\n", " ... \n", "477 chamarocka, kholo, nda kangue\n", "478 Amdjoudoul iwacki koudjoumass\n", "479 Bengue, arabo, pro français, pro russes, moutons,\n", "480 Gangster\n", "481 - Lawa Lawa\\n- Benguè\\n- Bandaï\n", "Name: terms_list, Length: 482, dtype: object" ] }, "metadata": { "tags": [] }, "execution_count": 25 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ot60Lmew6qp0", "outputId": "3735a9c3-5c4e-4489-caf0-f8ab54c950b2" }, "source": [ "# %pip installs into the running kernel's environment; the version is pinned to\n", "# the one reported by this cell's stored install log so a re-run is reproducible.\n", "%pip install contractions==0.0.43\n", "import contractions" ], "execution_count": 26, "outputs": [ { "output_type": "stream", "text": [ "Collecting contractions\n", " Downloading https://files.pythonhosted.org/packages/ce/ad/d1c685967945a04f8596128b15a1ab56c51488f53312e953341af6ff22d1/contractions-0.0.43-py2.py3-none-any.whl\n", "Collecting textsearch\n", " Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl\n", "Collecting pyahocorasick\n", "\u001b[?25l 
Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)\n", "\u001b[K |████████████████████████████████| 317kB 5.9MB/s \n", "\u001b[?25hCollecting Unidecode\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)\n", "\u001b[K |████████████████████████████████| 245kB 41.9MB/s \n", "\u001b[?25hBuilding wheels for collected packages: pyahocorasick\n", " Building wheel for pyahocorasick (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for pyahocorasick: filename=pyahocorasick-1.4.0-cp36-cp36m-linux_x86_64.whl size=81707 sha256=0cb88880bcc215b7a3749858fd619d028c638f276938a7ffe08d22897d33c4d8\n", " Stored in directory: /root/.cache/pip/wheels/0a/90/61/87a55f5b459792fbb2b7ba6b31721b06ff5cf6bde541b40994\n", "Successfully built pyahocorasick\n", "Installing collected packages: pyahocorasick, Unidecode, textsearch, contractions\n", "Successfully installed Unidecode-1.1.1 contractions-0.0.43 pyahocorasick-1.4.0 textsearch-0.0.17\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "yFq0P9PHFMFr" }, "source": [ "# Split each phrase on whitespace and pass every token through contractions.fix,\n", "# storing the resulting list of strings per row.\n", "# NOTE(review): contractions appears to target English text, while the phrases\n", "# previewed in this notebook are mostly non-English (e.g. French) - confirm this step is useful.\n", "terms['no_contract'] = section2.apply(\n", "    lambda phrase: list(map(contractions.fix, phrase.split()))\n", ")" ], "execution_count": 27, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 276 }, "id": "v6Q_V3rWFoNA", "outputId": "0d3d3388-87ad-4d37-fd94-30a9137b94e4" }, "source": [ "terms.head()" ], "execution_count": 28, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
terms_listterm_1term_2term_3no_contract
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]
1rienrienNaNNaN[rien]
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...
\n", "
" ], "text/plain": [ " terms_list ... no_contract\n", "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]\n", "1 rien ... [rien]\n", "2 HAINE ; RELIGION ; ETHNIQUE ; ... [HAINE, ;, RELIGION, ;, ETHNIQUE, ;]\n", "3 TETUE ; VOYOU ; MO YINGA MBI ? ... [TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]\n", "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...\n", "\n", "[5 rows x 5 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 28 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 293 }, "id": "yCUL-zsLQ4-O", "outputId": "0dbc0192-3ead-40d9-f81d-00d575672d0a" }, "source": [ "# Re-join each row's token list into a single space-separated string.\n", "terms[\"msg_str\"] = terms['no_contract'].apply(\n", "    lambda tokens: ' '.join(str(token) for token in tokens)\n", ")\n", "terms.head()" ], "execution_count": 36, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
terms_listterm_1term_2term_3no_contractmsg_str
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,
1rienrienNaNNaN[rien]rien
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]HAINE ; RELIGION ; ETHNIQUE ;
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]TETUE ; VOYOU ; MO YINGA MBI ?
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...
\n", "
" ], "text/plain": [ " terms_list ... msg_str\n", "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,\n", "1 rien ... rien\n", "2 HAINE ; RELIGION ; ETHNIQUE ; ... HAINE ; RELIGION ; ETHNIQUE ;\n", "3 TETUE ; VOYOU ; MO YINGA MBI ? ... TETUE ; VOYOU ; MO YINGA MBI ?\n", "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...\n", "\n", "[5 rows x 6 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 36 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cEwARPq_GG08", "outputId": "f568dc5f-a4f2-44c4-9964-89c0e712fb8e" }, "source": [ "import nltk\n", "nltk.download('punkt')\n", "from nltk.tokenize import word_tokenize" ], "execution_count": 30, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "18uESl1iP1RL" }, "source": [ "text = \"Hi, I would like to tokenize this sentence\"" ], "execution_count": 31, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MF66YtCuP5YZ", "outputId": "72c38ef0-513a-435a-8950-80de388ea66c" }, "source": [ "print(word_tokenize(text))" ], "execution_count": 32, "outputs": [ { "output_type": "stream", "text": [ "['Hi', ',', 'I', 'would', 'like', 'to', 'tokenize', 'this', 'sentence']\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "tAsC9yEvQNNt" }, "source": [ "# Run word_tokenize on every re-joined phrase; each row gets its own token list.\n", "terms['tokenized'] = [word_tokenize(message) for message in terms['msg_str']]" ], "execution_count": 38, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 328 }, "id": "YbkjpCyiRRNt", "outputId": "782297b9-595b-4aba-bb89-ceea73fdc3ff" }, "source": [ "terms.head()" ], "execution_count": 39, "outputs": [ { "output_type": 
"execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
terms_listterm_1term_2term_3no_contractmsg_strtokenized
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...
1rienrienNaNNaN[rien]rien[rien]
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]HAINE ; RELIGION ; ETHNIQUE ;[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]TETUE ; VOYOU ; MO YINGA MBI ?[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...
\n", "
" ], "text/plain": [ " terms_list ... tokenized\n", "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...\n", "1 rien ... [rien]\n", "2 HAINE ; RELIGION ; ETHNIQUE ; ... [HAINE, ;, RELIGION, ;, ETHNIQUE, ;]\n", "3 TETUE ; VOYOU ; MO YINGA MBI ? ... [TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]\n", "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...\n", "\n", "[5 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 39 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 380 }, "id": "_w9FQIj9SrTG", "outputId": "dfef973c-bfcb-4c76-d891-c91e4c0c4bd7" }, "source": [ "# Lower-case every token so later steps treat differently cased spellings alike.\n", "terms['lower'] = terms['tokenized'].apply(\n", "    lambda tokens: list(map(str.lower, tokens))\n", ")\n", "terms.head()" ], "execution_count": 41, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
terms_listterm_1term_2term_3no_contractmsg_strtokenizedlower
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...[saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...
1rienrienNaNNaN[rien]rien[rien][rien]
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]HAINE ; RELIGION ; ETHNIQUE ;[HAINE, ;, RELIGION, ;, ETHNIQUE, ;][haine, ;, religion, ;, ethnique, ;]
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]TETUE ; VOYOU ; MO YINGA MBI ?[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?][tetue, ;, voyou, ;, mo, yinga, mbi, ?]
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...[les, gbakas, mandja, sont, trop, egoistes, ;,...
\n", "
" ], "text/plain": [ " terms_list ... lower\n", "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...\n", "1 rien ... [rien]\n", "2 HAINE ; RELIGION ; ETHNIQUE ; ... [haine, ;, religion, ;, ethnique, ;]\n", "3 TETUE ; VOYOU ; MO YINGA MBI ? ... [tetue, ;, voyou, ;, mo, yinga, mbi, ?]\n", "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [les, gbakas, mandja, sont, trop, egoistes, ;,...\n", "\n", "[5 rows x 8 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 41 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 380 }, "id": "GXCYOG-XTYJy", "outputId": "5936953c-3c9e-49a9-dcda-3418e1ce939c" }, "source": [ "import string\n", "punc = string.punctuation\n", "# Drop a token only when every character in it is punctuation.\n", "# A plain `word not in punc` is a substring test against the punctuation string,\n", "# so multi-character tokens such as '...' would be kept; checking character by\n", "# character removes them while still keeping mixed tokens like '-moto'.\n", "terms['no_punc'] = terms['lower'].apply(\n", "    lambda x: [word for word in x if not all(ch in punc for ch in word)]\n", ")\n", "terms.head()" ], "execution_count": 42, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
terms_listterm_1term_2term_3no_contractmsg_strtokenizedlowerno_punc
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...[saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...[saranga, ti, wali, taxi, -moto, voyou]
1rienrienNaNNaN[rien]rien[rien][rien][rien]
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]HAINE ; RELIGION ; ETHNIQUE ;[HAINE, ;, RELIGION, ;, ETHNIQUE, ;][haine, ;, religion, ;, ethnique, ;][haine, religion, ethnique]
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]TETUE ; VOYOU ; MO YINGA MBI ?[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?][tetue, ;, voyou, ;, mo, yinga, mbi, ?][tetue, voyou, mo, yinga, mbi]
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...[les, gbakas, mandja, sont, trop, egoistes, ;,...[les, gbakas, mandja, sont, trop, egoistes, le...
\n", "
" ], "text/plain": [ " terms_list ... no_punc\n", "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [saranga, ti, wali, taxi, -moto, voyou]\n", "1 rien ... [rien]\n", "2 HAINE ; RELIGION ; ETHNIQUE ; ... [haine, religion, ethnique]\n", "3 TETUE ; VOYOU ; MO YINGA MBI ? ... [tetue, voyou, mo, yinga, mbi]\n", "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [les, gbakas, mandja, sont, trop, egoistes, le...\n", "\n", "[5 rows x 9 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 42 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UYk5KcIZUJAZ", "outputId": "85b4298c-44e8-4714-a51b-fae2b2afecaf" }, "source": [ "# Whitespace-split the raw phrases, flatten them to one token per entry and\n", "# show the 50 most frequent tokens (punctuation and original casing included).\n", "(\n", "    terms['terms_list']\n", "    .str.split(expand=True)\n", "    .stack()\n", "    .value_counts()\n", "    .head(50)\n", ")" ], "execution_count": 55, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "; 943\n", "TI 274\n", "MO 188\n", "A 174\n", "ZO 102\n", "SO 88\n", "ALA 78\n", "LA 77\n", "LO 69\n", "BA 49\n", "NA 46\n", "GA 44\n", "? 44\n", "MBI 38\n", "YEKE 35\n", ", 34\n", "BOUBA 33\n", "LES 31\n", "WALI 29\n", "APE 28\n", "TU 28\n", "I 26\n", "BENGUE 26\n", "MAMA 24\n", "AWE 22\n", "LAWA 20\n", "DE 19\n", "ARABO 19\n", "PINDOUNGOU 18\n", "DES 18\n", "RELIGION 17\n", "TOUADERA 17\n", "MBORORO 17\n", "ETHNIQUE 16\n", "GANGO 16\n", "ETRANGER 16\n", "YA 15\n", "LE 15\n", "KATA 15\n", "MBO 14\n", "GBAYA 14\n", "ME 13\n", "ES 13\n", "BANDA 13\n", "TA 13\n", "POLITIQUE 13\n", "INGA 13\n", "KE 13\n", "SELEKA 13\n", "ANDE 12\n", "dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 55 } ] } ] }