{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from fastai.vision.all import *\n", "import gradio as gr\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import os" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Load the trained model and its class labels\n", "learn = load_learner('nsfw_model.pkl')\n", "labels = learn.dls.vocab\n", "\n", "def analyze(url):\n", "    \"\"\"Analyzer function that classifies the images found at the given URL\"\"\"\n", "\n", "    # Make sure the URL starts with http or https\n", "    # TODO: confirm that the url points to a web page, and not some other resource.\n", "    # Regex could be useful here\n", "    if not url.startswith(('http://', 'https://')):\n", "        url = 'http://' + url\n", "\n", "    safety = 'safe'  # our return value\n", "\n", "    # Fetch the page HTML and extract all img tags\n", "    html = requests.get(url)\n", "    soup = BeautifulSoup(html.text, \"html.parser\")\n", "    img_elements = soup.find_all(\"img\")\n", "\n", "    # Save all src URLs that clearly look like image URLs.\n", "    # A better approach would be to use regex here\n", "    srcs = []\n", "    for img in img_elements:\n", "        for v in img.attrs.values():\n", "            if isinstance(v, str) and v.lower().endswith(('jpg', 'jpeg', 'png', 'gif')):\n", "                srcs.append(v)\n", "\n", "    # Download the images from the URLs and classify them.\n", "    # If there is a single unsafe image, report the page as not safe.\n", "    for src_url in srcs:\n", "        try:\n", "            img_data = requests.get(src_url).content\n", "            temp = 'temp.' + src_url.lower().split('.')[-1]\n", "            with open(temp, 'wb') as handler:\n", "                handler.write(img_data)\n", "            is_nsfw, _, probs = learn.predict(PILImage.create(temp))\n", "            os.remove(temp)\n", "            if is_nsfw == \"unsafe_searches\":\n", "                safety = 'NOT safe'\n", "                return safety\n", "        except Exception:\n", "            # Skip images that cannot be downloaded or decoded (e.g. relative or broken URLs)\n", "            continue\n", "    return safety" ] },
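{ "cell_type": "markdown", "metadata": {}, "source": [ "One possible way to address the regex TODOs above (a sketch only, not yet wired into `analyze`): resolve relative `src` values with `urllib.parse.urljoin` and filter them with a regular expression instead of the `endswith` check. The helper name `extract_image_urls` is illustrative, not part of the app." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "from urllib.parse import urljoin\n", "from bs4 import BeautifulSoup\n", "\n", "# Matches URLs whose path ends in a common image extension,\n", "# optionally followed by a query string (e.g. photo.jpg?w=300).\n", "IMG_URL_RE = re.compile(r'\\\\.(jpe?g|png|gif|webp)(\\\\?.*)?$', re.IGNORECASE)\n", "\n", "def extract_image_urls(page_url, html_text):\n", "    \"\"\"Collect absolute image URLs from a page's HTML (illustrative helper).\"\"\"\n", "    soup = BeautifulSoup(html_text, 'html.parser')\n", "    urls = []\n", "    for img in soup.find_all('img'):\n", "        src = img.get('src') or img.get('data-src')  # some sites lazy-load via data-src\n", "        if not src:\n", "            continue\n", "        absolute = urljoin(page_url, src)  # resolve relative paths like /img/a.jpg\n", "        if IMG_URL_RE.search(absolute):\n", "            urls.append(absolute)\n", "    return urls\n", "\n", "# Example (hypothetical page):\n", "# extract_image_urls('https://example.com', requests.get('https://example.com').text)" ] },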
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(, 'http://127.0.0.1:7867/', None)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "title = \"Website Safety Analyzer\"\n", "description = \"**The internet is not safe for children**. Even if we know the 'bad' sites, social media is hard to regulate. \\n\"+\\\n", " \"This is step one in an attempt to solve that. An image classifier that audits every image at a URL. \\n\"+\\\n", " \"In this iteration, I classify sites with sexually explicit content as **'NOT safe'**. \\n\\n\"+\\\n", " \"There is a long way to go with NLP for profanity, cyber-bullying, as well as CV for violence, substance abuse, etc. \\n\"+\\\n", " \"Another step will be to convert this into a browser extension/add-on. \\n\"+\\\n", " \"I welcome any help on this. 🙂\"\n", "examples = ['porhub.com', 'cnn.com', 'xvideos.com', 'www.pinterest.com']\n", "enable_queue=True\n", "\n", "iface = gr.Interface(\n", " fn=analyze, \n", " inputs=\"text\", \n", " outputs=\"text\",\n", " title=title,\n", " description=description,\n", " examples=examples,\n", ")\n", "iface.launch(enable_queue=enable_queue)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "vscode": { "interpreter": { "hash": "ed0e91aaffcefde6eb9bcd4f55fe7652d77471dc031ce772257aa5eb4a54e8f2" } } }, "nbformat": 4, "nbformat_minor": 2 }