diff --git "a/fineweb_bias_eval.ipynb" "b/fineweb_bias_eval.ipynb"
new file mode 100644--- /dev/null
+++ "b/fineweb_bias_eval.ipynb"
@@ -0,0 +1,2705 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Load packages"
+ ],
+ "metadata": {
+ "id": "utSDkGUL101i"
+ },
+ "id": "utSDkGUL101i"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "34299990-bd58-4fe9-99fe-15d4b6796106",
+ "metadata": {
+ "id": "34299990-bd58-4fe9-99fe-15d4b6796106",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "e99e0fdc-27ee-4e6f-bc64-18f6127b9b3a"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.19.1)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.14.0)\n",
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.25.2)\n",
+ "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (14.0.2)\n",
+ "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n",
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.0.3)\n",
+ "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n",
+ "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.4)\n",
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n",
+ "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
+ "Requirement already satisfied: fsspec[http]<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2024.3.1)\n",
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.5)\n",
+ "Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.23.1)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.0)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.2->datasets) (4.11.0)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.7)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.7)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2024.2.2)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.4)\n",
+ "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
+ "Requirement already satisfied: datatrove in /usr/local/lib/python3.10/dist-packages (0.2.0)\n",
+ "Requirement already satisfied: dill>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.3.8)\n",
+ "Requirement already satisfied: fsspec>=2023.12.2 in /usr/local/lib/python3.10/dist-packages (from datatrove) (2024.3.1)\n",
+ "Requirement already satisfied: huggingface-hub>=0.17.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.23.1)\n",
+ "Requirement already satisfied: humanize in /usr/local/lib/python3.10/dist-packages (from datatrove) (4.7.0)\n",
+ "Requirement already satisfied: loguru>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.7.2)\n",
+ "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.70.16)\n",
+ "Requirement already satisfied: numpy>=1.25.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (1.25.2)\n",
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from datatrove) (4.66.4)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (3.14.0)\n",
+ "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (24.0)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (6.0.1)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (2.31.0)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (4.11.0)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (3.3.2)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (3.7)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (2.0.7)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (2024.2.2)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install datasets\n",
+ "!pip install datatrove\n",
+ "import datasets\n",
+ "import json\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from datatrove.pipeline.readers import ParquetReader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "922a0454",
+ "metadata": {
+ "id": "922a0454",
+ "outputId": "8500a12a-6856-46ac-bb65-6f86db4bb001",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "The rich extension is already loaded. To reload it, use:\n",
+ " %reload_ext rich\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext rich"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "703c7781-0a33-41dc-8da9-2fa034483cad",
+ "metadata": {
+ "id": "703c7781-0a33-41dc-8da9-2fa034483cad"
+ },
+ "source": [
+ "## Methodology\n",
+ "\n",
+ "In order to measure bias in the dataset, we consider the following simple [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) based approach. The idea is that the specificity of a term -- in our case, how `biased` it is -- can be quantified as an inverse function of the number of documents in which it occurs.\n",
+ "\n",
+ "Given a dataset and terms for a subpopulation (gender) of interest:\n",
+ "1. Evaluate Inverse Document Frequencies on the full dataset\n",
+ "2. Compute the average TF-IDF vectors for the dataset for a given subpopulation (gender)\n",
+ "3. Sort the terms by variance to see words that are much more likely to appear specifically for a given subpopulation\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7c837c65-987f-45cf-b18d-fc7836894372",
+ "metadata": {
+ "id": "7c837c65-987f-45cf-b18d-fc7836894372"
+ },
+ "source": [
+ "### Load Fineweb\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dbd19018",
+ "metadata": {
+ "id": "dbd19018",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "2852efb2-954f-460f-d143-18baa0408973"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "\u001b[32m2024-05-29 19:38:01.457\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdatatrove.pipeline.readers.base\u001b[0m:\u001b[36mread_files_shard\u001b[0m:\u001b[36m193\u001b[0m - \u001b[1mReading input file 000_00000.parquet\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "local = False\n",
+ "data_reader = ParquetReader(\"hf://datasets/HuggingFaceFW/fineweb/sample/10BT\")\n",
+ "all_docs = [document.text for document in data_reader()]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Compute frequencies"
+ ],
+ "metadata": {
+ "id": "eBj1TtiW2C-6"
+ },
+ "id": "eBj1TtiW2C-6"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Step 1: get Inverse document frequencies for the dataset\n",
+ "vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')\n",
+ "full_tfidf = vectorizer.fit_transform(all_docs)\n",
+ "tfidf_feature_names = np.array(vectorizer.get_feature_names_out())"
+ ],
+ "metadata": {
+ "id": "e_nQogiWceYZ"
+ },
+ "id": "e_nQogiWceYZ",
+ "execution_count": 50,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Bias analysis: Gender tf-idf"
+ ],
+ "metadata": {
+ "id": "aqIybwilj0KH"
+ },
+ "id": "aqIybwilj0KH"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Step 2: get average TF-IDF vectors **for each gender**\n",
+ "GENDER_PHRASES = [\"man\", \"woman\"]\n",
+ "tfidf_by_gender = {}\n",
+ "for phrase in GENDER_PHRASES:\n",
+ " gdr_docs = [doc for doc in all_docs if phrase in doc.split()]\n",
+ " if gdr_docs != []:\n",
+ " gdr_tfidf = np.asarray(vectorizer.transform(gdr_docs).mean(axis=0))[0]\n",
+ " tfidf_by_gender[phrase] = gdr_tfidf"
+ ],
+ "metadata": {
+ "id": "d-Na79jvczt0"
+ },
+ "id": "d-Na79jvczt0",
+ "execution_count": 51,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Step 3: for each term, compute the variance across genders\n",
+ "all_tfidf = np.array(list(tfidf_by_gender.values()))\n",
+ "tf_idf_var = all_tfidf - all_tfidf.sum(axis=0, keepdims=True)\n",
+ "tf_idf_var = np.power((tf_idf_var * tf_idf_var).sum(axis=0), 0.5)\n",
+ "sort_by_variance = tf_idf_var.argsort()[::-1]"
+ ],
+ "metadata": {
+ "id": "D0sbbLyWw2CZ"
+ },
+ "id": "D0sbbLyWw2CZ",
+ "execution_count": 52,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "03393fe5-2a92-451a-bd08-6a27a6239097",
+ "metadata": {
+ "id": "03393fe5-2a92-451a-bd08-6a27a6239097"
+ },
+ "outputs": [],
+ "source": [
+ "# Create the data structure for the visualization,\n",
+ "# showing the highest variance words for each gender,\n",
+ "# and how they deviate from the mean\n",
+ "pre_pandas_lines = [\n",
+ " {\n",
+ " \"word\": tfidf_feature_names[w],\n",
+ " \"man\": all_tfidf[0, w],\n",
+ " \"woman\": all_tfidf[1, w],\n",
+ " \"man+\": all_tfidf[0, w] - all_tfidf[:, w].mean(),\n",
+ " \"woman+\": all_tfidf[1, w] - all_tfidf[:, w].mean(),\n",
+ " \"variance\": tf_idf_var[w],\n",
+ " \"total\": all_tfidf[:, w].sum(),\n",
+ " }\n",
+ " for w in sort_by_variance[:50]\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Results"
+ ],
+ "metadata": {
+ "id": "IhJC-iT91smy"
+ },
+ "id": "IhJC-iT91smy"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Plot\n",
+ "df = pd.DataFrame.from_dict(pre_pandas_lines)\n",
+ "df.style.background_gradient(\n",
+ " axis=None,\n",
+ " vmin=0,\n",
+ " vmax=0.2,\n",
+ " cmap=\"YlGnBu\"\n",
+ ").format(precision=2)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "LDLjFa6HdMWe",
+ "outputId": "d012172a-4c03-4505-83c6-7bd6c3c77a91"
+ },
+ "id": "LDLjFa6HdMWe",
+ "execution_count": 47,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [],
+ "text/html": [
+ "
\n"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eb340\u001b[0m\u001b[1m>\u001b[0m"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " word | \n",
+ " man | \n",
+ " woman | \n",
+ " man+ | \n",
+ " woman+ | \n",
+ " variance | \n",
+ " total | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " woman | \n",
+ " 0.01 | \n",
+ " 0.07 | \n",
+ " -0.03 | \n",
+ " 0.03 | \n",
+ " 0.07 | \n",
+ " 0.08 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " man | \n",
+ " 0.05 | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " -0.01 | \n",
+ " 0.05 | \n",
+ " 0.07 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " women | \n",
+ " 0.01 | \n",
+ " 0.04 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.04 | \n",
+ " 0.06 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " said | \n",
+ " 0.03 | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.01 | \n",
+ " 0.03 | \n",
+ " 0.05 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " people | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " tsa | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " life | \n",
+ " 0.03 | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.01 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " just | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " police | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " god | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " like | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " cancer | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " marriage | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " time | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " mouse | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " rudy | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " gangnam | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " medical | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " world | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " work | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " make | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " think | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " palin | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " john | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " surgery | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " anderson | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " day | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " gregory | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " st | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " hermit | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " says | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " know | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 32 | \n",
+ " use | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " plus | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " size | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 35 | \n",
+ " year | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " don | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " died | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " left | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " did | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " white | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " right | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " wife | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " sir | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 44 | \n",
+ " way | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " great | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " city | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " korean | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " camera | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 49 | \n",
+ " place | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 47
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e273abff-3d81-431f-9188-82d87d1ecda2",
+ "metadata": {
+ "id": "e273abff-3d81-431f-9188-82d87d1ecda2"
+ },
+ "source": [
+ "#### Sorting by bias\n",
+ "\n",
+ "In order to better surface biases, we can sort the table by how much one gender over-represents a term.\n",
+ "\n",
+ "In this case, we see that instances mentioning `man` are more likely to include `god` than those mentioning `woman`, which in turn are more likely to include `cancer`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "34229f06-5bf7-4ece-b43e-7d453931abd4",
+ "metadata": {
+ "id": "34229f06-5bf7-4ece-b43e-7d453931abd4",
+ "outputId": "7720b46d-a37d-4007-aa8e-8d7973f4f91c",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "collapsed": true
+ },
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [],
+ "text/html": [
+ "\n"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eac20\u001b[0m\u001b[1m>\u001b[0m"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " word | \n",
+ " man | \n",
+ " woman | \n",
+ " man+ | \n",
+ " woman+ | \n",
+ " variance | \n",
+ " total | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " man | \n",
+ " 0.05 | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " -0.01 | \n",
+ " 0.05 | \n",
+ " 0.07 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " said | \n",
+ " 0.03 | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.01 | \n",
+ " 0.03 | \n",
+ " 0.05 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " life | \n",
+ " 0.03 | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.01 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " god | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " just | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " like | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 44 | \n",
+ " way | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " think | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 49 | \n",
+ " place | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " right | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " time | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 35 | \n",
+ " year | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " know | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " make | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " people | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " police | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " day | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " says | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " great | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " city | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " did | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " don | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " st | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " left | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " john | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " world | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " korean | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " sir | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " marriage | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " work | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " hermit | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " gregory | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " palin | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " wife | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " gangnam | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " rudy | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 32 | \n",
+ " use | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " died | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " tsa | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " white | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " size | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " camera | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " plus | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " medical | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " surgery | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " anderson | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " mouse | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " women | \n",
+ " 0.01 | \n",
+ " 0.04 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.04 | \n",
+ " 0.06 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " cancer | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " woman | \n",
+ " 0.01 | \n",
+ " 0.07 | \n",
+ " -0.03 | \n",
+ " 0.03 | \n",
+ " 0.07 | \n",
+ " 0.08 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 45
+ }
+ ],
+ "source": [
+ "df.sort_values('man+', ascending=False).style.background_gradient(\n",
+ " axis=None,\n",
+ " vmin=0,\n",
+ " vmax=0.2,\n",
+ " cmap=\"YlGnBu\"\n",
+ ").format(precision=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.sort_values('woman+', ascending=False).style.background_gradient(\n",
+ " axis=None,\n",
+ " vmin=0,\n",
+ " vmax=0.2,\n",
+ " cmap=\"YlGnBu\"\n",
+ ").format(precision=2)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "ufATwOCojOdv",
+ "outputId": "299fdb81-a754-4afe-b0fd-5be8aac8c549"
+ },
+ "id": "ufATwOCojOdv",
+ "execution_count": 46,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [],
+ "text/html": [
+ "\n"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eab60\u001b[0m\u001b[1m>\u001b[0m"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " word | \n",
+ " man | \n",
+ " woman | \n",
+ " man+ | \n",
+ " woman+ | \n",
+ " variance | \n",
+ " total | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " woman | \n",
+ " 0.01 | \n",
+ " 0.07 | \n",
+ " -0.03 | \n",
+ " 0.03 | \n",
+ " 0.07 | \n",
+ " 0.08 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " cancer | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " women | \n",
+ " 0.01 | \n",
+ " 0.04 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.04 | \n",
+ " 0.06 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " mouse | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " anderson | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " surgery | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " medical | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " plus | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " camera | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " size | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " white | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " tsa | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " died | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 32 | \n",
+ " use | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " rudy | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " gangnam | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " wife | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " palin | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " gregory | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " hermit | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " work | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " marriage | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " sir | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " korean | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " world | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " john | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " left | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " st | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " don | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " did | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " city | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " great | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " says | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " day | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " police | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " people | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " make | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " know | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 35 | \n",
+ " year | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " time | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " right | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 49 | \n",
+ " place | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " think | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 44 | \n",
+ " way | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " like | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " just | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " god | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " -0.00 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " life | \n",
+ " 0.03 | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.01 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " said | \n",
+ " 0.03 | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " -0.01 | \n",
+ " 0.03 | \n",
+ " 0.05 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " man | \n",
+ " 0.05 | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " -0.01 | \n",
+ " 0.05 | \n",
+ " 0.07 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 46
+ }
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.1"
+ },
+ "colab": {
+ "provenance": []
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file