{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai.vision.all import *\n",
    "import gradio as gr\n",
    "import requests\n",
    "import base64\n",
    "from bs4 import BeautifulSoup\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the trained model from the exported learner file; `analyze` below uses it for inference.\n",
    "# NOTE(review): load_learner presumably unpickles the file - only load model files you trust.\n",
    "learn = load_learner('nsfw_model.pkl')\n",
    "# Vocabulary stored on the learner's DataLoaders (presumably the class labels).\n",
    "# Not referenced by any other cell visible in this notebook.\n",
    "labels = learn.dls.vocab\n",
    "\n",
    "_IMAGE_SUFFIXES = ('jpg', 'png', 'gif', 'jpeg')\n",
    "_REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given\n",
    "\n",
    "\n",
    "def _image_urls(soup, base_url):\n",
    "    \"\"\"Return the unique, absolute image URLs referenced by the <img> tags in `soup`.\"\"\"\n",
    "    from urllib.parse import urljoin, urlparse\n",
    "\n",
    "    found = {}  # dict instead of set: de-duplicates while keeping document order\n",
    "    for img in soup.find_all('img'):\n",
    "        # Look at every attribute, not just `src`, so lazy-loading attributes\n",
    "        # such as `data-src` are picked up as well.\n",
    "        for value in img.attrs.values():\n",
    "            if not isinstance(value, str):\n",
    "                continue\n",
    "            candidate = value.strip()\n",
    "            try:\n",
    "                # Also test the parsed path so 'a.jpg?w=300' is recognised.\n",
    "                is_image = (candidate.lower().endswith(_IMAGE_SUFFIXES)\n",
    "                            or urlparse(candidate).path.lower().endswith(_IMAGE_SUFFIXES))\n",
    "                if is_image:\n",
    "                    # Resolve relative and protocol-relative references against the page.\n",
    "                    found[urljoin(base_url, candidate)] = None\n",
    "            except ValueError:\n",
    "                # urlparse/urljoin reject some malformed values; ignore those.\n",
    "                continue\n",
    "    return list(found)\n",
    "\n",
    "\n",
    "def analyze(url):\n",
    "    \"\"\"Analyzer function that classifies the images found at the given URL.\n",
    "\n",
    "    The page is downloaded, every attribute of every <img> tag is scanned for\n",
    "    values that look like jpg/jpeg/png/gif URLs, and each such image is\n",
    "    downloaded and passed to the module-level `learn` model.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the page to audit. 'http://' is prepended when the\n",
    "            value has no http/https scheme.\n",
    "\n",
    "    Returns:\n",
    "        'NOT safe' as soon as one image is predicted as \"unsafe_searches\",\n",
    "        otherwise 'safe'. Images that cannot be downloaded or decoded are\n",
    "        skipped.\n",
    "\n",
    "    Raises:\n",
    "        requests.RequestException: if the page itself cannot be fetched.\n",
    "    \"\"\"\n",
    "    import logging\n",
    "\n",
    "    # Make sure URL starts with http or https\n",
    "    # TODO: confirm that the url points to a web page, and not some resource.\n",
    "    url = url.strip()\n",
    "    if not url.lower().startswith(('http://', 'https://')):\n",
    "        url = 'http://' + url\n",
    "\n",
    "    # Fetch the page. A failure here still propagates to the caller.\n",
    "    page = requests.get(url, timeout=_REQUEST_TIMEOUT)\n",
    "    soup = BeautifulSoup(page.text, 'html.parser')\n",
    "\n",
    "    # Get the images from the urls and classify.\n",
    "    # If there is a single unsafe image, report it.\n",
    "    # page.url is the final URL after redirects, so relative links resolve correctly.\n",
    "    for src_url in _image_urls(soup, page.url):\n",
    "        try:\n",
    "            response = requests.get(src_url, timeout=_REQUEST_TIMEOUT)\n",
    "            response.raise_for_status()\n",
    "            # Decode straight from memory: no shared temp file that could leak\n",
    "            # on error or be overwritten by a concurrent request.\n",
    "            prediction = learn.predict(PILImage.create(response.content))\n",
    "        except Exception as exc:\n",
    "            # Best effort: skip images that cannot be fetched or decoded,\n",
    "            # but leave a trace instead of swallowing the error silently.\n",
    "            logging.getLogger(__name__).debug('Skipping %s: %s', src_url, exc)\n",
    "            continue\n",
    "        if prediction[0] == \"unsafe_searches\":\n",
    "            return 'NOT safe'\n",
    "    return 'safe'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on local URL:  http://127.0.0.1:7867\n",
      "\n",
      "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7867/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(<gradio.routes.App at 0x7f0da61cb1f0>, 'http://127.0.0.1:7867/', None)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Text shown in the Gradio UI.\n",
    "title = \"Website Safety Analyzer\"\n",
    "# Adjacent string literals are joined at compile time; the two trailing\n",
    "# spaces before each newline are Markdown hard line breaks.\n",
    "description = (\n",
    "    \"**The internet is not safe for children**. Even if we know the 'bad' sites, social media is hard to regulate.  \\n\"\n",
    "    \"This is step one in an attempt to solve that. An image classifier that audits every image at a URL.  \\n\"\n",
    "    \"In this iteration, I classify sites with sexually explicit content as **'NOT safe'**.  \\n\\n\"\n",
    "    \"There is a long way to go with NLP for profanity, cyber-bullying, as well as CV for violence, substance abuse, etc.  \\n\"\n",
    "    \"Another step will be to convert this into a browser extension/add-on.  \\n\"\n",
    "    \"I welcome any help on this. 🙂\"\n",
    ")\n",
    "examples = ['porhub.com', 'cnn.com', 'xvideos.com', 'www.pinterest.com']\n",
    "enable_queue = True\n",
    "\n",
    "# Single text box in, single text box out, backed by `analyze`.\n",
    "iface = gr.Interface(\n",
    "    fn=analyze,\n",
    "    inputs=\"text\",\n",
    "    outputs=\"text\",\n",
    "    title=title,\n",
    "    description=description,\n",
    "    examples=examples,\n",
    ")\n",
    "# `launch(enable_queue=...)` is deprecated in Gradio 3 and removed in Gradio 4;\n",
    "# queueing is switched on through `.queue()` instead.\n",
    "if enable_queue:\n",
    "    iface.queue()\n",
    "# Trailing semicolon keeps the launch() return tuple out of the cell output.\n",
    "iface.launch();"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "vscode": {
   "interpreter": {
    "hash": "ed0e91aaffcefde6eb9bcd4f55fe7652d77471dc031ce772257aa5eb4a54e8f2"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}