Kwadwo Agyapon-Ntra committed
Commit 28ff501 • 1 Parent(s): 983a157
Files changed (5)
  1. README.md +2 -2
  2. app.py +71 -0
  3. nsfw_model.pkl +3 -0
  4. requirements.txt +2 -0
  5. test.ipynb +162 -0
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
- title: WebsiteImageSafetyAnalyzer
- emoji: 📈
+ title: Website Image Safety Analyzer
+ emoji: 🧐
  colorFrom: blue
  colorTo: yellow
  sdk: gradio
app.py ADDED
@@ -0,0 +1,71 @@
+ from fastai.vision.all import *
+ import gradio as gr
+ import requests
+ import base64
+ from bs4 import BeautifulSoup
+ import os
+
+ # Load the trained model
+ learn = load_learner('nsfw_model.pkl')
+ labels = learn.dls.vocab
+
+ def analyze(url):
+     """Analyzer function that classifies the images found at the given URL"""
+
+     # Make sure URL starts with http or https
+     # TODO: confirm that the url points to a web page, and not some resource.
+     # Regex could be useful here
+     if not url.startswith(('http://', 'https://')):
+         url = 'http://' + url
+
+     safety = 'safe'  # our return variable
+
+     # Extract html and all img tags
+     html = requests.get(url)
+     soup = BeautifulSoup(html.text, "html.parser")
+     img_elements = soup.find_all("img")
+
+     # Save all src urls that we can clearly tell are img urls.
+     # A better approach would be to use regex here
+     srcs = []
+     for img in img_elements:
+         for v in img.attrs.values():
+             if isinstance(v, str):
+                 if v.lower().endswith(('jpg', 'png', 'gif', 'jpeg')):
+                     srcs.append(v)
+
+     # Get the images from the urls and classify.
+     # If there is a single unsafe image, report it.
+     for src_url in srcs:
+         try:
+             img_data = requests.get(src_url).content
+             temp = 'temp.' + src_url.lower().split('.')[-1]
+             with open(temp, 'wb') as handler:
+                 handler.write(img_data)
+             is_nsfw, _, probs = learn.predict(PILImage.create(temp))
+             os.remove(temp)
+             if is_nsfw == "unsafe_searches":
+                 safety = 'NOT safe'
+                 return safety
+         except Exception:
+             pass
+     return safety
+
+ title = "Website Safety Analyzer"
+ description = "**The internet is not safe for children**. Even if we know the 'bad' sites, social media is hard to regulate. \n" + \
+     "This is step one in an attempt to solve that: an image classifier that audits every image at a URL. \n" + \
+     "In this iteration, I classify sites with sexually explicit content as **'NOT safe'**. \n\n" + \
+     "There is a long way to go with NLP for profanity and cyber-bullying, as well as CV for violence, substance abuse, etc. \n" + \
+     "I welcome any help on this. 🙂"
+ examples = ['porhub.com', 'cnn.com', 'xvideos.com', 'www.pinterest.com']
+ enable_queue = True
+
+ iface = gr.Interface(
+     fn=analyze,
+     inputs="text",
+     outputs="text",
+     title=title,
+     description=description,
+     examples=examples,
+ )
+ iface.launch(enable_queue=enable_queue)
nsfw_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:018578406ed833284ff69a8198f71c4c71ce537afb0861a602f2240bd3cb3110
+ size 46972399
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ fastai
+ beautifulsoup4
test.ipynb ADDED
@@ -0,0 +1,162 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from fastai.vision.all import *\n",
+     "import gradio as gr\n",
+     "import requests\n",
+     "import base64\n",
+     "from bs4 import BeautifulSoup\n",
+     "import os"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Load the trained model\n",
+     "learn = load_learner('nsfw_model.pkl')\n",
+     "labels = learn.dls.vocab\n",
+     "\n",
+     "def analyze(url):\n",
+     "    \"\"\"Analyzer function that classifies the images found at the given URL\"\"\"\n",
+     "\n",
+     "    # Make sure URL starts with http or https\n",
+     "    # TODO: confirm that the url points to a web page, and not some resource.\n",
+     "    # Regex could be useful here\n",
+     "    if not url.startswith(('http://', 'https://')):\n",
+     "        url = 'http://' + url\n",
+     "\n",
+     "    safety = 'safe'  # our return variable\n",
+     "\n",
+     "    # Extract html and all img tags\n",
+     "    html = requests.get(url)\n",
+     "    soup = BeautifulSoup(html.text, \"html.parser\")\n",
+     "    img_elements = soup.find_all(\"img\")\n",
+     "\n",
+     "    # Save all src urls that we can clearly tell are img urls.\n",
+     "    # A better approach would be to use regex here\n",
+     "    srcs = []\n",
+     "    for img in img_elements:\n",
+     "        for v in img.attrs.values():\n",
+     "            if isinstance(v, str):\n",
+     "                if v.lower().endswith(('jpg', 'png', 'gif', 'jpeg')):\n",
+     "                    srcs.append(v)\n",
+     "\n",
+     "    # Get the images from the urls and classify.\n",
+     "    # If there is a single unsafe image, report it.\n",
+     "    for src_url in srcs:\n",
+     "        try:\n",
+     "            img_data = requests.get(src_url).content\n",
+     "            temp = 'temp.' + src_url.lower().split('.')[-1]\n",
+     "            with open(temp, 'wb') as handler:\n",
+     "                handler.write(img_data)\n",
+     "            is_nsfw, _, probs = learn.predict(PILImage.create(temp))\n",
+     "            os.remove(temp)\n",
+     "            if is_nsfw == \"unsafe_searches\":\n",
+     "                safety = 'NOT safe'\n",
+     "                return safety\n",
+     "        except Exception:\n",
+     "            pass\n",
+     "    return safety"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 11,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Running on local URL: http://127.0.0.1:7867\n",
+       "\n",
+       "To create a public link, set `share=True` in `launch()`.\n"
+      ]
+     },
+     {
+      "data": {
+       "text/html": [
+        "<div><iframe src=\"http://127.0.0.1:7867/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/plain": [
+        "(<gradio.routes.App at 0x7f0da61cb1f0>, 'http://127.0.0.1:7867/', None)"
+       ]
+      },
+      "execution_count": 11,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "title = \"Website Safety Analyzer\"\n",
+     "description = \"**The internet is not safe for children**. Even if we know the 'bad' sites, social media is hard to regulate. \\n\"+\\\n",
+     "    \"This is step one in an attempt to solve that: an image classifier that audits every image at a URL. \\n\"+\\\n",
+     "    \"In this iteration, I classify sites with sexually explicit content as **'NOT safe'**. \\n\\n\"+\\\n",
+     "    \"There is a long way to go with NLP for profanity and cyber-bullying, as well as CV for violence, substance abuse, etc. \\n\"+\\\n",
+     "    \"I welcome any help on this. 🙂\"\n",
+     "examples = ['porhub.com', 'cnn.com', 'xvideos.com', 'www.pinterest.com']\n",
+     "enable_queue = True\n",
+     "\n",
+     "iface = gr.Interface(\n",
+     "    fn=analyze,\n",
+     "    inputs=\"text\",\n",
+     "    outputs=\"text\",\n",
+     "    title=title,\n",
+     "    description=description,\n",
+     "    examples=examples,\n",
+     ")\n",
+     "iface.launch(enable_queue=enable_queue)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.6"
+   },
+   "vscode": {
+    "interpreter": {
+     "hash": "ed0e91aaffcefde6eb9bcd4f55fe7652d77471dc031ce772257aa5eb4a54e8f2"
+    }
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
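One cleanup that might be worth a future commit: the classify loop writes every download to a temp file only to read it straight back. fastai's `PILImage.create` also accepts in-memory data via a file-like object, so the disk round-trip could plausibly be dropped; a sketch under that assumption (the `timeout` value and the helper name are additions for illustration, not in this commit):

from io import BytesIO

import requests
from fastai.vision.all import PILImage, load_learner

learn = load_learner('nsfw_model.pkl')  # as in the commit

def classify_image_url(src_url: str):
    """Download an image and classify it in memory, with no temp file."""
    resp = requests.get(src_url, timeout=10)  # bound the per-image wait (assumed value)
    return learn.predict(PILImage.create(BytesIO(resp.content)))

This would also remove the failure mode where an exception between `open` and `os.remove` leaves a stray `temp.*` file behind.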