Spaces:

FaceOnLive
/

Face-Liveness-Detection-SDK

Running

App Files Files Community

Zhu-FaceOnLive committed on Oct 26, 2023

Commit

2ded60b

•

1 Parent(s): 1accf5d

Initial commit.

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +14 -0
Dockerfile +19 -0
app.py +115 -0
facewrapper/dict/data1.bin +3 -0
facewrapper/dict/data2.bin +3 -0
facewrapper/dict/data3.bin +3 -0
facewrapper/facewrapper.py +31 -0
facewrapper/libs/libimutils.so +0 -0
facewrapper/libs/libimutils.so_for_ubuntu22 +0 -0
facewrapper/libs/libttvfaceengine7.so +3 -0
gradio/demo.py +32 -0
gradio/examples/1.jpg +0 -0
gradio/examples/2.jpg +0 -0
gradio/examples/3.jpg +0 -0
gradio/examples/4.jpg +0 -0
openvino/cache.json +0 -0
openvino/libgna.so +3 -0
openvino/libgna.so.2 +3 -0
openvino/libgna.so.3.0.0.1455 +3 -0
openvino/libopenvino.so +3 -0
openvino/libopenvino_auto_batch_plugin.so +0 -0
openvino/libopenvino_auto_plugin.so +0 -0
openvino/libopenvino_c.so +0 -0
openvino/libopenvino_gapi_preproc.so +3 -0
openvino/libopenvino_hetero_plugin.so +0 -0
openvino/libopenvino_intel_cpu_plugin.so +3 -0
openvino/libopenvino_intel_gna_plugin.so +3 -0
openvino/libopenvino_intel_hddl_plugin.so +3 -0
openvino/libopenvino_intel_myriad_plugin.so +3 -0
openvino/libopenvino_ir_frontend.so +0 -0
openvino/libopenvino_onnx_frontend.so +3 -0
openvino/libopenvino_paddle_frontend.so +0 -0
openvino/libopenvino_tensorflow_fe.so +3 -0
openvino/pcie-ma2x8x.mvcmd +3 -0
openvino/plugins.xml +27 -0
openvino/usb-ma2x8x.mvcmd +3 -0
openvino/vpu_custom_kernels/binarization.bin +3 -0
openvino/vpu_custom_kernels/binarization.cl +67 -0
openvino/vpu_custom_kernels/binary_convolution.bin +3 -0
openvino/vpu_custom_kernels/binary_convolution.cl +95 -0
openvino/vpu_custom_kernels/binary_convolution1x1.bin +3 -0
openvino/vpu_custom_kernels/binary_convolution1x1.cl +117 -0
openvino/vpu_custom_kernels/binary_convolution3x3.bin +3 -0
openvino/vpu_custom_kernels/binary_convolution3x3.cl +278 -0
openvino/vpu_custom_kernels/convolution1x1_chw.bin +3 -0
openvino/vpu_custom_kernels/convolution1x1_chw.cl +114 -0
openvino/vpu_custom_kernels/convolution1x1_hwc.bin +3 -0
openvino/vpu_custom_kernels/convolution1x1_hwc.cl +126 -0
openvino/vpu_custom_kernels/convolution3x3.bin +3 -0
openvino/vpu_custom_kernels/convolution3x3.cl +158 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+facewrapper/libs/libttvfaceengine7.so filter=lfs diff=lfs merge=lfs -text
+openvino/libgna.so filter=lfs diff=lfs merge=lfs -text
+openvino/libgna.so.2 filter=lfs diff=lfs merge=lfs -text
+openvino/libgna.so.3.0.0.1455 filter=lfs diff=lfs merge=lfs -text
+openvino/libopenvino_gapi_preproc.so filter=lfs diff=lfs merge=lfs -text
+openvino/libopenvino_intel_cpu_plugin.so filter=lfs diff=lfs merge=lfs -text
+openvino/libopenvino_intel_gna_plugin.so filter=lfs diff=lfs merge=lfs -text
+openvino/libopenvino_intel_hddl_plugin.so filter=lfs diff=lfs merge=lfs -text
+openvino/libopenvino_intel_myriad_plugin.so filter=lfs diff=lfs merge=lfs -text
+openvino/libopenvino_onnx_frontend.so filter=lfs diff=lfs merge=lfs -text
+openvino/libopenvino_tensorflow_fe.so filter=lfs diff=lfs merge=lfs -text
+openvino/libopenvino.so filter=lfs diff=lfs merge=lfs -text
+openvino/pcie-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text
+openvino/usb-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,19 @@

+FROM ubuntu:20.04
+# CONTAINER_TIMEZONE was referenced below without ever being declared, so it
+# expanded to an empty string. Declare it with a default; override it with
+# `--build-arg CONTAINER_TIMEZONE=<Area/City>` when needed.
+ARG CONTAINER_TIMEZONE=Etc/UTC
+# Keep apt (tzdata in particular) from prompting during the build. ARG rather
+# than ENV so the setting does not leak into the running container.
+ARG DEBIAN_FRONTEND=noninteractive
+RUN ln -snf "/usr/share/zoneinfo/${CONTAINER_TIMEZONE}" /etc/localtime && echo "${CONTAINER_TIMEZONE}" > /etc/timezone
+# update and install share one layer so a cached package index is never paired
+# with a newer install step; the index is dropped afterwards to shrink the image.
+RUN apt-get update -y && \
+    apt-get install -y python3 python3-pip python3-opencv libcurl4-openssl-dev libssl-dev && \
+    rm -rf /var/lib/apt/lists/*
+# One mkdir -p creates both the application directory and its facewrapper child.
+RUN mkdir -p /home/FaceOnLive_v7/facewrapper
+WORKDIR /home/FaceOnLive_v7
+COPY ./facewrapper ./facewrapper
+COPY ./facewrapper/libs/libimutils.so /usr/lib
+COPY ./gradio ./gradio
+COPY ./openvino /usr/lib
+COPY ./app.py ./app.py
+COPY ./run.sh .
+COPY ./requirements.txt ./requirements.txt
+RUN pip3 install -r requirements.txt
+RUN chmod a+x run.sh
+CMD ["./run.sh"]
+# NOTE(review): app.py defaults to port 8000 and gradio/demo.py binds 7860;
+# confirm which port run.sh actually serves before relying on 9000.
+EXPOSE 9000

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import sys
+sys.path.append('.')
+from flask import Flask, request, jsonify
+from time import gmtime, strftime
+import os
+import base64
+import json
+import cv2
+import numpy as np
+from facewrapper.facewrapper import ttv_version
+from facewrapper.facewrapper import ttv_get_hwid
+from facewrapper.facewrapper import ttv_init
+from facewrapper.facewrapper import ttv_init_offline
+from facewrapper.facewrapper import ttv_detect_face
+app = Flask(__name__)
+app.config['SITE'] = "http://0.0.0.0:8000/"
+app.config['DEBUG'] = False
+licenseKey = os.environ.get("LICENSE_KEY")
+licensePath = "license.txt"
+modelFolder = os.path.abspath(os.path.dirname(__file__)) + '/facewrapper/dict'
+version = ttv_version()
+print("version: ", version.decode('utf-8'))
+ret = ttv_init(modelFolder.encode('utf-8'), licenseKey.encode('utf-8'))
+if ret != 0:
+    print(f"online init failed: {ret}");
+    hwid = ttv_get_hwid()
+    print("hwid: ", hwid.decode('utf-8'))
+    ret = ttv_init_offline(modelFolder.encode('utf-8'), licensePath.encode('utf-8'))
+    if ret != 0:
+        print(f"offline init failed: {ret}")
+        exit(-1)
+    else:
+        print(f"offline init ok")
+else:
+    print(f"online init ok")
+@app.route('/api/liveness', methods=['POST'])
+def check_liveness():
+  file = request.files['image']
+  image = cv2.imdecode(np.fromstring(file.read(), np.uint8), cv2.IMREAD_COLOR)
+  faceRect = np.zeros([4], dtype=np.int32)
+  livenessScore = np.zeros([1], dtype=np.double)
+  angles = np.zeros([3], dtype=np.double)
+  ret = ttv_detect_face(image, image.shape[1], image.shape[0], faceRect, livenessScore, angles)
+  if ret == -1:
+      result = "license error!"
+  elif ret == -2:
+      result = "init error!"
+  elif ret == 0:
+      result = "no face detected!"
+  elif ret > 1:
+      result = "multiple face detected!"
+  elif faceRect[0] < 0 or faceRect[1] < 0 or faceRect[2] >= image.shape[1] or faceRect[2] >= image.shape[0]:
+      result = "faace is in boundary!"
+  elif livenessScore[0] > 0.5:
+      result = "genuine"
+  else:
+      result = "spoof"
+  status = "ok"
+  response = jsonify({"status": status, "data": {"result": result, "face_rect": {"x": int(faceRect[0]), "y": int(faceRect[1]), "w": int(faceRect[2] - faceRect[0] + 1), "h" : int(faceRect[3] - faceRect[1] + 1)}, "liveness_score": livenessScore[0],
+    "angles": {"yaw": angles[0], "roll": angles[1], "pitch": angles[2]}}})
+  response.status_code = 200
+  response.headers["Content-Type"] = "application/json; charset=utf-8"
+  return response
+@app.route('/api/liveness_base64', methods=['POST'])
+def check_liveness_base64():
+  content = request.get_json()
+  imageBase64 = content['image']
+  image = cv2.imdecode(np.frombuffer(base64.b64decode(imageBase64), dtype=np.uint8), cv2.IMREAD_COLOR)
+  faceRect = np.zeros([4], dtype=np.int32)
+  livenessScore = np.zeros([1], dtype=np.double)
+  angles = np.zeros([3], dtype=np.double)
+  ret = ttv_detect_face(image, image.shape[1], image.shape[0], faceRect, livenessScore, angles)
+  if ret == -1:
+      result = "license error!"
+  elif ret == -2:
+      result = "init error!"
+  elif ret == 0:
+      result = "no face detected!"
+  elif ret > 1:
+      result = "multiple face detected!"
+  elif faceRect[0] < 0 or faceRect[1] < 0 or faceRect[2] >= image.shape[1] or faceRect[2] >= image.shape[0]:
+      result = "faace is in boundary!"
+  elif livenessScore[0] > 0.5:
+      result = "genuine"
+  else:
+      result = "spoof"
+  status = "ok"
+  response = jsonify({"status": status, "data": {"result": result, "face_rect": {"x": int(faceRect[0]), "y": int(faceRect[1]), "w": int(faceRect[2] - faceRect[0] + 1), "h" : int(faceRect[3] - faceRect[1] + 1)}, "liveness_score": livenessScore[0],
+    "angles": {"yaw": angles[0], "roll": angles[1], "pitch": angles[2]}}})
+  response.status_code = 200
+  response.headers["Content-Type"] = "application/json; charset=utf-8"
+  return response
+if __name__ == '__main__':
+    port = int(os.environ.get("PORT", 8000))
+    app.run(host='0.0.0.0', port=port)

facewrapper/dict/data1.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36cf5fcc49345989a86839a53529314ec1fe5d621c377a1952bc7538d55e7f1b
+size 16255630

facewrapper/dict/data2.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa65c4b7df14f0c96c174868a1b1c675adc8c4a11e3c0807009f3d0cad51f5a
+size 280076956

facewrapper/dict/data3.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f25fb0cd3d70cb84c258e7109620f411c087e0875828d6ab86cc9c4838d49bec
+size 11875339

facewrapper/facewrapper.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import ctypes, ctypes.util
+from ctypes import *
+from numpy.ctypeslib import ndpointer
+import sys
+import os
+sys.path.append('/opt/intel/openvino_2022/runtime/lib/intel64')
+lib_path = os.path.abspath(os.path.dirname(__file__)) + '/libs/libttvfaceengine7.so'
+liveness_engine = cdll.LoadLibrary(lib_path)
+ttv_version = liveness_engine.ttv_version
+ttv_version.argtypes = []
+ttv_version.restype = ctypes.c_char_p
+ttv_get_hwid = liveness_engine.ttv_get_hwid
+ttv_get_hwid.argtypes = []
+ttv_get_hwid.restype = ctypes.c_char_p
+ttv_init = liveness_engine.ttv_init
+ttv_init.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
+ttv_init.restype = ctypes.c_int32
+ttv_init_offline = liveness_engine.ttv_init_offline
+ttv_init_offline.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
+ttv_init_offline.restype = ctypes.c_int32
+ttv_detect_face = liveness_engine.ttv_detect_face
+ttv_detect_face.argtypes = [ndpointer(ctypes.c_ubyte, flags='C_CONTIGUOUS'), ctypes.c_int32, ctypes.c_int32, ndpointer(ctypes.c_int32, flags='C_CONTIGUOUS'), ndpointer(ctypes.c_double, flags='C_CONTIGUOUS'), ndpointer(ctypes.c_double, flags='C_CONTIGUOUS')]
+ttv_detect_face.restype = ctypes.c_int32

facewrapper/libs/libimutils.so ADDED Viewed

Binary file (412 kB). View file

facewrapper/libs/libimutils.so_for_ubuntu22 ADDED Viewed

Binary file (412 kB). View file

facewrapper/libs/libttvfaceengine7.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3d6f12326c8bd60242dd7366cfebeef69d25a296bdd9d329d3033e8b70e782f
+size 3664979

gradio/demo.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import gradio as gr
+import requests
+import json
+def face_liveness(frame):
+    url = "http://127.0.0.1:8000/api/liveness"
+    files = None
+    if frame is None:
+        return ['', None]
+    files = {'image': open(frame, 'rb')}
+    r = requests.post(url=url, files=files)
+    return r.json()
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+    # Face Liveness Detection
+    """
+    )
+    with gr.Row():
+        with gr.Column(scale=5):
+            image_input = gr.Image(type='filepath')
+            gr.Examples(['gradio/examples/1.jpg', 'gradio/examples/2.jpg', 'gradio/examples/3.jpg', 'gradio/examples/4.jpg'],
+                            inputs=image_input)
+            face_liveness_button = gr.Button("Check Liveness")
+        with gr.Column(scale=5):
+            liveness_result_output = gr.JSON()
+    face_liveness_button.click(face_liveness, inputs=image_input, outputs=liveness_result_output)
+demo.launch(server_name="0.0.0.0", server_port=7860)

gradio/examples/1.jpg ADDED Viewed

gradio/examples/2.jpg ADDED Viewed

gradio/examples/3.jpg ADDED Viewed

gradio/examples/4.jpg ADDED Viewed

openvino/cache.json ADDED Viewed

The diff for this file is too large to render. See raw diff

openvino/libgna.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
+size 3120536

openvino/libgna.so.2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
+size 3120536

openvino/libgna.so.3.0.0.1455 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
+size 3120536

openvino/libopenvino.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd216848c1ba78e62360c12c9684df0c160f6962f3d900e5918cc042b42b2b46
+size 13495416

openvino/libopenvino_auto_batch_plugin.so ADDED Viewed

Binary file (391 kB). View file

openvino/libopenvino_auto_plugin.so ADDED Viewed

Binary file (371 kB). View file

openvino/libopenvino_c.so ADDED Viewed

Binary file (305 kB). View file

openvino/libopenvino_gapi_preproc.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ac5ce0a8f3acefb41e8aa8161f78035dafff25c4b8c3485ebc541573b2b15f0
+size 1312920

openvino/libopenvino_hetero_plugin.so ADDED Viewed

Binary file (367 kB). View file

openvino/libopenvino_intel_cpu_plugin.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:afe05ada6d5b11495a21787fa6ab0162fc40f7a9ab97be78f7b7185126d15b18
+size 33299880

openvino/libopenvino_intel_gna_plugin.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef15b623e7f81788160c4056ccd5e887a8184affe381e84a906646ef36cae1ab
+size 4067016

openvino/libopenvino_intel_hddl_plugin.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96362327fbc404e88583bdcd2a526ccbf4ca26d4ecdb8898234be7986d9b8b2b
+size 5894680

openvino/libopenvino_intel_myriad_plugin.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e596436002565356b80400e0d7e50093d53d338f623b171f658de527477852de
+size 6120168

openvino/libopenvino_ir_frontend.so ADDED Viewed

Binary file (343 kB). View file

openvino/libopenvino_onnx_frontend.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0770ed09d471b20bffcf4ef57ab1fb002db04c4404598bd5c52a4418a67f5441
+size 3781640

openvino/libopenvino_paddle_frontend.so ADDED Viewed

Binary file (987 kB). View file

openvino/libopenvino_tensorflow_fe.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2dadbcd8ba32cec02873caf8dcc644d1d8856cdcd2978c603e5bac169e01bb9
+size 2723864

openvino/pcie-ma2x8x.mvcmd ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f03146453508f2bcab1589907bccaa429b48db6123a7b8a428d6ce221d1fbb4d
+size 2099248

openvino/plugins.xml ADDED Viewed

	@@ -0,0 +1,27 @@

+<ie>
+    <plugins>
+        <plugin name="AUTO" location="libopenvino_auto_plugin.so">
+            <properties>
+                <property key="MULTI_WORK_MODE_AS_AUTO" value="YES"/>
+            </properties>
+        </plugin>
+        <plugin name="BATCH" location="libopenvino_auto_batch_plugin.so">
+        </plugin>
+        <plugin name="CPU" location="libopenvino_intel_cpu_plugin.so">
+        </plugin>
+        <plugin name="GNA" location="libopenvino_intel_gna_plugin.so">
+        </plugin>
+        <plugin name="GPU" location="libopenvino_intel_gpu_plugin.so">
+        </plugin>
+        <plugin name="HETERO" location="libopenvino_hetero_plugin.so">
+        </plugin>
+        <plugin name="MULTI" location="libopenvino_auto_plugin.so">
+        </plugin>
+        <plugin name="MYRIAD" location="libopenvino_intel_myriad_plugin.so">
+        </plugin>
+        <plugin name="HDDL" location="libopenvino_intel_hddl_plugin.so">
+        </plugin>
+        <plugin name="VPUX" location="libopenvino_intel_vpux_plugin.so">
+        </plugin>
+    </plugins>
+</ie>

openvino/usb-ma2x8x.mvcmd ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:faf33388b88708177a358fcb4704eba04b1cf9e88d6a047f90c833d686140a2e
+size 2298632

openvino/vpu_custom_kernels/binarization.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3e0de6082c7bacca2ff5ad131f0afc44304fc792a6d99e7829399eb61491a0ac
+size 19632

openvino/vpu_custom_kernels/binarization.cl ADDED Viewed

	@@ -0,0 +1,67 @@

+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+// binarization kernel: writes dst_low where a source value is <= the threshold and dst_high otherwise.
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+__kernel void binarization(
+    const __global half *__restrict src_data, // input; W * H elements are copied in from offset get_group_id(2) * W * H
+    const __global half *__restrict input_low_high, // threshold value(s) the source is compared against
+    const __global half *__restrict dst_data, // output; NOTE(review): declared const yet used as destination of the final async copy
+    int switch_out, // non-zero swaps the outputs: low becomes +1 and high becomes -1
+    int input_low_high_size, // 1 => a single shared threshold, otherwise indexed by get_global_id(2)
+    int W, // row length in elements
+    int H) // number of rows
+{
+    __local half local_src[15 * 1024]; // staging copy of the input; presumably W * H must fit in 15 * 1024 halfs -- TODO confirm
+    __local half local_dst[15 * 1024]; // results are assembled here before being copied out
+    event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
+    wait_group_events(1, &e1); // input must be resident before it is read below
+    int c = get_global_id(2);
+    int C = get_global_size(2); // not referenced again in this kernel
+    half dst_low  = switch_out ? 1.h : -1.h;
+    half dst_high = switch_out ? -1.h : 1.h;
+    half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];
+    for (int h = 0; h < H; h++) {
+        __local const half *__restrict addr_src = local_src + h * W;
+        __local half *__restrict addr_dst       = local_dst + h * W;
+#if 1
+        for (int w = 0; w < W / 8; w++) { // vector path: eight columns per iteration
+            half8 h_src_val8 = (*((__local half8 *)addr_src + w));
+            short8 cond1;
+            cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
+            cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
+            cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
+            cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
+            cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
+            cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
+            cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
+            cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);
+            cond1 = ~(cond1 - (short8)1); // 1 -> all bits set, 0 -> all bits clear: per-lane select mask
+            short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high); // bitwise select; & binds tighter than |
+            *((__local half8 *)addr_dst + w) = as_half8(res);
+        }
+#endif
+        for (int w = W & (~0x7); w < W; w++) { // scalar tail for the columns left after the groups of eight
+            addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE); // local_dst must be complete before it is copied out
+    event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
+    wait_group_events(1, &e2);
+}

openvino/vpu_custom_kernels/binary_convolution.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12c349d6f73c233b158e1d67af31715c7b8bda79f191b1e759476e01e65bb64a
+size 10764

openvino/vpu_custom_kernels/binary_convolution.cl ADDED Viewed

	@@ -0,0 +1,95 @@

+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+// Binary convolution: each work-item computes one output element, addressed by its global ids (x, y, c).
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+int extract_weights(uchar val, int bit) { return ((val >> bit) & 1); } // value (0 or 1) of bit number 'bit' in val
+__kernel void binary_convolution(
+    const __global half *restrict src_data, // input activations; only the test "> 0" is applied to them
+    const __global uchar *restrict weights_data, // one-bit weights, eight per byte
+    __global half *restrict dst_data, // output, one half per work-item
+    float pad_value, // only "pad_value > 0" is used, as the bit for out-of-range taps
+    int IW, // input width (upper bound for iw)
+    int IH, // input height (upper bound for ih)
+    int IC, // input channel count
+    int DW, // multiplier applied to kw when forming iw
+    int DH, // multiplier applied to kh when forming ih
+    int GC, // divisor that splits channels into groups
+    int KW, // kernel width (kw loop bound)
+    int KH, // kernel height (kh loop bound)
+    int PW, // subtracted from the horizontal input coordinate
+    int PH, // subtracted from the vertical input coordinate
+    int SW, // multiplier applied to ow when forming iw
+    int SH) // multiplier applied to oh when forming ih
+{
+    int ipad_value = ((pad_value > 0.f) ? 1 : 0);
+    int c          = get_global_id(2);
+    int y          = get_global_id(1);
+    int x          = get_global_id(0);
+    int OC = get_global_size(2); // output extents are taken from the launch dimensions
+    int OH = get_global_size(1);
+    int OW = get_global_size(0);
+    int KD = 1; // depth extents are fixed to one, so the od and kd loops below run exactly once
+    int SD = 0;
+    int DD = 0;
+    int PD = 0;
+    int ID = 1;
+    int OD = 1;
+    int nbits = 8; // weight bits stored per byte
+    int g  = c % GC;
+    int oc = c / GC;
+    int oh = y;
+    int ow = x;
+    for (int od = 0; od < OD; od++) {
+        int oidx = g * OC / GC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow;
+        int res = 0; // number of taps whose input bit differs from the weight bit
+        for (int ic = 0; ic < IC / GC; ic++) {
+            for (int kd = 0; kd < KD; kd++) {
+                for (int kh = 0; kh < KH; kh++) {
+                    for (int kw = 0; kw < KW; kw++) {
+                        int widx = g * OC / GC * IC / GC * KD * KH * KW
+                                   + oc * IC / GC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW
+                                   + kh * KW + kw;
+                        int w = extract_weights(weights_data[widx / nbits], (widx % nbits)); // byte widx / 8, bit widx % 8
+                        int s;
+                        int iw = ow * SW - PW + kw * DW;
+                        int ih = oh * SH - PH + kh * DH;
+                        int id = od * SD - PD + kd * DD;
+                        if (iw < 0 || iw >= (int)IW || ih < 0 || ih >= (int)IH || id < 0
+                            || id >= (int)ID) {
+                            s = ipad_value; // tap falls outside the input: use the padding bit
+                        } else {
+                            int iidx = g * IC / GC * ID * IH * IW + ic * ID * IH * IW + id * IH * IW
+                                       + ih * IW + iw;
+                            s = ((src_data[iidx] > 0.f) ? 1 : 0); // reduce the input value to one bit
+                        }
+                        res += s ^ w;
+                    }
+                }
+            }
+        }
+        dst_data[oidx] = (half)(IC / GC * KD * KH * KW - 2 * res); // tap count minus twice the mismatch count
+    }
+}

openvino/vpu_custom_kernels/binary_convolution1x1.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6deff31d62aa84c643fbeba77e7dcd4ae5d9b488c1c98e07fffeb58ff8e9b945
+size 76316

openvino/vpu_custom_kernels/binary_convolution1x1.cl ADDED Viewed

	@@ -0,0 +1,117 @@

+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+// 1x1 binary convolution: each work-group produces OW outputs for one row (oh) of one output channel (oc).
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); } // value (0 or 1) of bit number 'bit' in val
+__kernel void binary_convolution(
+    const __global half *restrict src_data, // input activations
+    const __global uchar *restrict weights_data, // one-bit weights, eight per byte
+    __global half *restrict dst_data, // output; row oh of channel oc is written at oc * OW * OH + oh * OW
+    float pad_value, // converted to half and used for rows/columns outside the input
+    int IW, // input width
+    int IH, // input height
+    int IC, // input channel count
+    int DW, // not referenced in the visible body
+    int DH, // not referenced in the visible body
+    int GC, // divisor that splits channels into groups
+    int KW, // not referenced in the visible body
+    int KH, // not referenced in the visible body
+    int PW, // not referenced in the visible body
+    int PH, // not referenced in the visible body
+    int SW, // horizontal step between the input columns that yield outputs
+    int SH, // vertical step: output row oh reads input row SH * oh
+    int OW) // output width
+{
+    __local half src_local[32 * 1024]; // staging area: one input row per channel of the group
+    __local half dst_local[2 * 1024]; // one output row, copied out at the end
+    const int oh = get_group_id(0);
+    const int oc = get_group_id(1);
+    const int OH = get_global_size(0);
+    const int OC = get_global_size(1);
+    const int gc = oc / (OC / GC);
+    if (oh * SH >= 0 && oh * SH <= IH - 1) { // source row lies inside the input: fetch it for every channel of the group
+        const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW;
+        event_t e1 = async_work_group_copy_2D2D(
+            src_local, // dst
+            src, // src
+            IW, // num_elements_per_line,
+            IC / GC, // num_lines,
+            IH * IW - IW, // src_line_stride,
+            0, // dst_line_stride,
+            0);
+        wait_group_events(1, &e1);
+    }
+    half pad_value_half = convert_half(pad_value);
+    //padding row
+    if (oh * SH > IH - 1) { // source row lies below the input: fill every channel's row with the pad value
+        __local half *dst = src_local;
+        for (int c = 0; c < IC / GC; c++) {
+            #pragma unroll 8
+            for (int j = 0; j < IW; j++) {
+                dst[j] = pad_value_half;
+            }
+            dst += IW;
+        }
+    }
+    int OWS = SW * OW;
+    ushort8 in;
+    for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) { // walk the row in chunks of eight input columns
+        ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0}; // per-lane count of taps where input bit and weight bit differ
+        for (int ic = 0; ic < IC / GC; ++ic) {
+            __local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8);
+            int weight_pos    = oc * IC / GC + ic;
+            ushort w =
+                extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8)); // byte weight_pos / 8, bit weight_pos % 8
+            if ((ows8 * 8) <= IW - 1) { // load only when the chunk starts inside the row
+                in = *((__local ushort8 *)(src));
+            }
+            //padding column
+            if (ows8 * 8 + 7 > IW - 1) {
+                int boundary = (IW - 1) - ows8 * 8 + 1; // number of lanes of this chunk that are still inside the row
+                boundary     = boundary < 0 ? 0 : boundary;
+                for (int offset = boundary; offset < 8; offset++) {
+                    *((half *)(&in) + offset) = pad_value_half;
+                }
+            }
+            ushort8 w8 = (ushort8)(w);
+            ushort8 cond =
+                (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0); // raw half bits: sign bit clear and non-zero pattern; presumably stands for "value > 0" -- TODO confirm
+            val += (cond ^ w8);
+        }
+        ushort8 val_shift = val << 1; // twice the mismatch count
+        int boundary      = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
+        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
+            *(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8)); // channels per group minus twice the mismatches, read from lane ow * SW - ows8 * 8
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE); // dst_local must be complete before it is copied out
+    event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
+    wait_group_events(1, &e2);
+}

openvino/vpu_custom_kernels/binary_convolution3x3.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55e3c3f8863ff7a3583bcc7340d1e226775f5f14cfb11dd32bd671764570f7cb
+size 104136

openvino/vpu_custom_kernels/binary_convolution3x3.cl ADDED Viewed

	@@ -0,0 +1,278 @@

+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
+__kernel void binary_convolution(
+    const __global half *restrict src_data,
+    const __global uchar *restrict weights_data,
+    const __global half *restrict dst_data,
+    float pad_value,
+    int IW,
+    int IH,
+    int IC,
+    int DW,
+    int DH,
+    int GC,
+    int KW,
+    int KH,
+    int PW,
+    int PH,
+    int SW,
+    int SH,
+    int OW)
+{
+    __local half src_local[32 * 1024];
+    __local half dst_local[2 * 1024];
+    const int oh = get_group_id(0);
+    const int oc = get_group_id(1);
+    const int OH = get_global_size(0);
+    const int OC = get_global_size(1);
+    const int gc = oc / (OC / GC);
+    if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
+    {
+        event_t e = async_work_group_copy_3D3D(
+            src_local, // dst
+            src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src
+            IW, // num_elements_per_line
+            3, // num_lines
+            DH * IW - IW, // src_line_stride
+            0, // dst_line_stride
+            IC / GC, // num planes
+            IH * IW - 3 * DH * IW, // src plane stride
+            0, // dst plane stride
+            0);
+        wait_group_events(1, &e);
+    } else {
+        int ih = oh * SH - 1;
+        if (ih >= 0 && ih <= IH - 1) //dma for first row
+        {
+            event_t e = async_work_group_copy_2D2D(
+                src_local, // dst
+                src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
+                IW, // num_elements_per_line,
+                IC / GC, // num_lines,
+                IH * IW - IW, // src_line_stride,
+                2 * IW, // dst_line_stride,
+                0);
+            wait_group_events(1, &e);
+        }
+        ih = oh * SH - 1 + DH;
+        if (ih >= 0 && ih <= IH - 1) //dma for second row
+        {
+            event_t e = async_work_group_copy_2D2D(
+                src_local + IW, // dst
+                src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
+                IW, // num_elements_per_line,
+                IC / GC, // num_lines,
+                IH * IW - IW, // src_line_stride,
+                2 * IW, // dst_line_stride,
+                0);
+            wait_group_events(1, &e);
+        }
+        ih = oh * SH - 1 + 2 * DH;
+        if (ih >= 0 && ih <= IH - 1) //dma for third row
+        {
+            event_t e = async_work_group_copy_2D2D(
+                src_local + 2 * IW, // dst
+                src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
+                IW, // num_elements_per_line,
+                IC / GC, // num_lines,
+                IH * IW - IW, // src_line_stride,
+                2 * IW, // dst_line_stride,
+                0);
+            wait_group_events(1, &e);
+        }
+    }
+    half pad_value_half = convert_half(pad_value);
+    //padding row
+    if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) {
+        __local half *dst = src_local;
+        for (int c = 0; c < IC / GC; c++) {
+            #pragma unroll 8
+            for (int j = 0; j < IW; j++) {
+                dst[j] = pad_value_half;
+            }
+            dst += 3 * IW;
+        }
+    }
+    if (oh * SH + DH - 1 > IH - 1) {
+        __local half *dst = src_local + IW;
+        for (int c = 0; c < IC / GC; c++) {
+            #pragma unroll 8
+            for (int j = 0; j < IW; j++) {
+                dst[j] = pad_value_half;
+            }
+            dst += 3 * IW;
+        }
+    }
+    if (oh * SH + DH + DH - 1 > IH - 1) {
+        __local half *dst = src_local + 2 * IW;
+        for (int c = 0; c < IC / GC; c++) {
+            #pragma unroll 8
+            for (int j = 0; j < IW; j++) {
+                dst[j] = pad_value_half;
+            }
+            dst += 3 * IW;
+        }
+    }
+    int OWS = SW * OW;
+    ushort8 in00;
+    ushort8 in01;
+    ushort8 in02;
+    ushort8 in10;
+    ushort8 in11;
+    ushort8 in12;
+    ushort8 in20;
+    ushort8 in21;
+    ushort8 in22;
+    for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
+        ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
+        for (int ic = 0; ic < IC / GC; ++ic) {
+            __local half *src =
+                (__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8);
+            int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3;
+            ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
+            ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8));
+            ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8));
+            ushort w3 = extract_weights(weights_data[((weight_pos + 3)) / 8], ((weight_pos + 3) % 8));
+            ushort w4 = extract_weights(weights_data[((weight_pos + 4)) / 8], ((weight_pos + 4) % 8));
+            ushort w5 = extract_weights(weights_data[((weight_pos + 5)) / 8], ((weight_pos + 5) % 8));
+            ushort w6 = extract_weights(weights_data[((weight_pos + 6)) / 8], ((weight_pos + 6) % 8));
+            ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8));
+            ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], ((weight_pos + 8) % 8));
+            if ((ows8 * 8) - 1 <= IW - 1) {
+                in00 = *((__local ushort8 *)(src - IW - DW));
+                in01 = *((__local ushort8 *)(src - IW));
+                in02 = *((__local ushort8 *)(src - IW + DW));
+                in10 = *((__local ushort8 *)(src - DW));
+                in11 = *((__local ushort8 *)(src));
+                in12 = *((__local ushort8 *)(src + DW));
+                in20 = *((__local ushort8 *)(src + IW - DW));
+                in21 = *((__local ushort8 *)(src + IW));
+                in22 = *((__local ushort8 *)(src + IW + DW));
+            }
+            //padding column
+            if (ows8 * 8 - 1 < 0) {
+                int boundary = 1 - ows8 * 8;
+                boundary     = boundary > 8 ? 8 : boundary;
+                for (int offset = 0; offset < boundary; offset++) {
+                    *((half *)(&in00) + offset) = pad_value_half;
+                    *((half *)(&in10) + offset) = pad_value_half;
+                    *((half *)(&in20) + offset) = pad_value_half;
+                }
+            }
+            if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) {
+                int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
+                boundary     = boundary < 0 ? 0 : boundary;
+                for (int offset = boundary; offset < 8; offset++) {
+                    *((half *)(&in02) + offset) = pad_value_half;
+                    *((half *)(&in12) + offset) = pad_value_half;
+                    *((half *)(&in22) + offset) = pad_value_half;
+                }
+            }
+            if ((ows8 * 8 + 7) + DW - 1 > IW - 1) {
+                int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
+                boundary     = boundary < 0 ? 0 : boundary;
+                for (int offset = boundary; offset < 8; offset++) {
+                    *((half *)(&in01) + offset) = pad_value_half;
+                    *((half *)(&in11) + offset) = pad_value_half;
+                    *((half *)(&in21) + offset) = pad_value_half;
+                }
+            }
+            if ((ows8 * 8 + 7) - 1 > IW - 1) {
+                int boundary = (IW - 1 + 1) - ows8 * 8 + 1;
+                boundary     = boundary < 0 ? 0 : boundary;
+                for (int offset = boundary; offset < 8; offset++) {
+                    *((half *)(&in00) + offset) = pad_value_half;
+                    *((half *)(&in10) + offset) = pad_value_half;
+                    *((half *)(&in20) + offset) = pad_value_half;
+                }
+            }
+            ushort8 w00 = (ushort8)(w0);
+            ushort8 w01 = (ushort8)(w1);
+            ushort8 w02 = (ushort8)(w2);
+            ushort8 w10 = (ushort8)(w3);
+            ushort8 w11 = (ushort8)(w4);
+            ushort8 w12 = (ushort8)(w5);
+            ushort8 w20 = (ushort8)(w6);
+            ushort8 w21 = (ushort8)(w7);
+            ushort8 w22 = (ushort8)(w8);
+            ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            val += (cond0 ^ w00);
+            val += (cond1 ^ w01);
+            val += (cond2 ^ w02);
+            val += (cond3 ^ w10);
+            val += (cond4 ^ w11);
+            val += (cond5 ^ w12);
+            val += (cond6 ^ w20);
+            val += (cond7 ^ w21);
+            val += (cond8 ^ w22);
+        }
+        ushort8 val_shift = val << 1;
+        int boundary      = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
+        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
+            *(dst_local + ow) =
+                (half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
+    wait_group_events(1, &e2);
+}

openvino/vpu_custom_kernels/convolution1x1_chw.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8717c8429d41a69337007871137f06a9e6b38c685b5b3fecc634fade0eaa7e7f
+size 9220

openvino/vpu_custom_kernels/convolution1x1_chw.cl ADDED Viewed

	@@ -0,0 +1,114 @@

+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+// 1x1 convolution over planar (channel-major) input. Each work-group produces one
+// output row (selected by group id 0) of one output plane (selected by group id 1).
+//
+//   in         - input activations; row get_group_id(0) of every one of the IC planes
+//                is staged into local memory
+//   out        - output; OW results are written at
+//                out + get_group_id(1) * OW * OH + get_group_id(0) * OW
+//   w          - weights, IC consecutive values per output channel
+//   IW, IH, IC - input width, height and channel count
+//   OW, OH, OC - output width, height and channel count (OC is not read here)
+//
+// NOTE(review): input column == output column below, so this appears to assume
+// IW == OW (unit stride, no padding) - confirm against the host-side binding.
+// NOTE(review): the staging copy writes IC * IW halves into in_local, so that product
+// presumably must not exceed 8 * 1024 - confirm.
+// NOTE(review): `out` is const-qualified although it is the destination of the final
+// async copy; left untouched because it is part of the kernel signature.
+__kernel void Convolution1x1_NCHW(
+    const __global half *in,
+    const __global half *out,
+    const __global half *w,
+    int IW,
+    int IH,
+    int IC,
+    int OW,
+    int OH,
+    int OC)
+{
+    __local half in_local[8 * 1024];
+    __local half out_local[8 * 1024];
+    // Stage one input row from every channel: IC lines of IW halves, packed back to back.
+    event_t e1 = async_work_group_copy_2D2D(
+        in_local, // dst
+        in + get_group_id(0) * IW, // src
+        IW, // num_elements_per_line,
+        IC, // num_lines,
+        IW * IH - IW, // src_line_stride,
+        0, // dst_line_stride,
+        0);
+    wait_group_events(1, &e1);
+    int oc = get_global_id(1);
+    // Two views of this output channel's weights: 8 at a time and one at a time.
+    __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
+    __global half *w1  = (__global half *)(&w[oc * IC]);
+    // Full groups of 8 output columns.
+    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
+        uint iw = ow;
+        half8 val8_0 = 0.0f;
+        // in8_k points at columns iw..iw+7 of channel k; indexing it with ic * IW
+        // (in half8 units) advances by 8 channels.
+        __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
+        __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
+        __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
+        __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
+        __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
+        __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
+        __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
+        __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
+        for (uint ic = 0; ic < IC / 8; ic++) {
+            val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
+            val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
+            val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
+            val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
+            val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
+            val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
+            val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
+            val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
+        }
+        // Channels left over when IC is not a multiple of 8.
+        for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
+            val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
+        }
+        *((__local half8 *)&out_local[ow + 0]) = (val8_0);
+    }
+    // Remaining OW % 8 columns: compute one more vector of 8 and keep only the lanes
+    // that map to real output columns. Skipped when OW is a multiple of 8, because no
+    // lane would be stored and the half8 loads would start at the very end of each row.
+    if ((OW & 0x7) != 0) {
+        uint iw = (OW & (~0x7));
+        half8 val8_0 = 0.0f;
+        __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
+        __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
+        __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
+        __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
+        __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
+        __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
+        __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
+        __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
+        for (uint ic = 0; ic < IC / 8; ic++) {
+            val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
+            val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
+            val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
+            val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
+            val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
+            val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
+            val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
+            val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
+        }
+        for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
+            val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
+        }
+        for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
+            out_local[ow + 0] = (val8_0[ow % 8]);
+        }
+    }
+    // Make the whole row visible before it is copied out to global memory.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    event_t e2 = async_work_group_copy(
+        out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
+        out_local,
+        OW,
+        0);
+    wait_group_events(1, &e2);
+}

openvino/vpu_custom_kernels/convolution1x1_hwc.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b6122a6bf6f50d2c7fc612d4e286559f9c96746e166892d192e1264e1ce5a2c
+size 4304

openvino/vpu_custom_kernels/convolution1x1_hwc.cl ADDED Viewed

	@@ -0,0 +1,126 @@

+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+// 1x1 convolution over interleaved (channel-minor) input. Each work-group produces one
+// output row (selected by group id 0) of one output plane (selected by group id 1).
+//
+//   in         - input activations; one row of IW pixels with IC channels each
+//                (IW * IC halves starting at get_group_id(0) * IW * IC) is staged locally
+//   out        - output; OW results are written at
+//                out + get_group_id(1) * OW * OH + get_group_id(0) * OW
+//   w          - weights, IC consecutive values per output channel
+//   IW, IH, IC - input width, height and channel count (IH is not read here)
+//   OW, OH, OC - output width, height and channel count (OC is not read here)
+//
+// NOTE(review): input column == output column below, so this appears to assume
+// IW == OW (unit stride, no padding) - confirm against the host-side binding.
+// NOTE(review): the staging copy writes IW * IC halves into in_local, so that product
+// presumably must not exceed 8 * 1024 - confirm.
+// NOTE(review): __builtin_shave_sau_sumx_f16_r is presumably a horizontal sum of the
+// eight half lanes - confirm against the SHAVE intrinsic reference.
+__kernel void Convolution1x1_NHWC(
+    const __global half *in,
+    const __global half *out,
+    const __global half *w,
+    int IW,
+    int IH,
+    int IC,
+    int OW,
+    int OH,
+    int OC)
+{
+    __local half in_local[8 * 1024];
+    __local half out_local[8 * 1024];
+    // Stage one full input row: IW pixels, IC interleaved channels per pixel.
+    const int sizeAct = IW * IC;
+    event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0);
+    wait_group_events(1, &e1);
+    int oc = get_global_id(1);
+    // Two views of this output channel's weights: 8 at a time and one at a time.
+    __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
+    __global half *w1  = (__global half *)(&w[oc * IC]);
+    // Full groups of 8 output columns, one accumulator per column.
+    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
+        uint iw = ow;
+        half8 val8_0 = 0.0f;
+        half8 val8_1 = 0.0f;
+        half8 val8_2 = 0.0f;
+        half8 val8_3 = 0.0f;
+        half8 val8_4 = 0.0f;
+        half8 val8_5 = 0.0f;
+        half8 val8_6 = 0.0f;
+        half8 val8_7 = 0.0f;
+        // in8_k points at the channel vector of pixel iw + k.
+        __local half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]);
+        __local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]);
+        __local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]);
+        __local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]);
+        __local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]);
+        __local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]);
+        __local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]);
+        __local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]);
+        // Lane-wise products over full groups of 8 channels; lanes are reduced later.
+        for (uint ic = 0; ic < IC / 8; ++ic) {
+            val8_0 += (in8_0[ic]) * (w8[ic]);
+            val8_1 += (in8_1[ic]) * (w8[ic]);
+            val8_2 += (in8_2[ic]) * (w8[ic]);
+            val8_3 += (in8_3[ic]) * (w8[ic]);
+            val8_4 += (in8_4[ic]) * (w8[ic]);
+            val8_5 += (in8_5[ic]) * (w8[ic]);
+            val8_6 += (in8_6[ic]) * (w8[ic]);
+            val8_7 += (in8_7[ic]) * (w8[ic]);
+        }
+        half val_0 = 0.0f;
+        half val_1 = 0.0f;
+        half val_2 = 0.0f;
+        half val_3 = 0.0f;
+        half val_4 = 0.0f;
+        half val_5 = 0.0f;
+        half val_6 = 0.0f;
+        half val_7 = 0.0f;
+        // Channels left over when IC is not a multiple of 8, handled one at a time.
+        for (uint ic = IC & (~0x7); ic < IC; ++ic) {
+            val_0 += *((__local half *)in8_0 + ic) * w1[ic];
+            val_1 += *((__local half *)in8_1 + ic) * w1[ic];
+            val_2 += *((__local half *)in8_2 + ic) * w1[ic];
+            val_3 += *((__local half *)in8_3 + ic) * w1[ic];
+            val_4 += *((__local half *)in8_4 + ic) * w1[ic];
+            val_5 += *((__local half *)in8_5 + ic) * w1[ic];
+            val_6 += *((__local half *)in8_6 + ic) * w1[ic];
+            val_7 += *((__local half *)in8_7 + ic) * w1[ic];
+        }
+        out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0;
+        out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
+        out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
+        out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
+        out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
+        out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
+        out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
+        out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
+    }
+    // Remaining OW % 8 columns, one pixel at a time.
+    for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
+        uint iw = ow;
+        half8 val8 = 0.0f;
+        __local half8 *in8 = (__local half8 *)(&in_local[iw * IC]);
+        for (uint ic = 0; ic < IC / 8; ++ic) {
+            val8 += (in8[ic]) * (w8[ic]);
+        }
+        half val = 0.0f;
+        for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
+            val += (*((__local half *)in8 + ic)) * w1[ic];
+        }
+        out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
+    }
+    // Make the whole row visible before it is copied out to global memory.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    event_t e2 = async_work_group_copy(
+        out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
+        out_local,
+        OW,
+        0);
+    wait_group_events(1, &e2);
+}

openvino/vpu_custom_kernels/convolution3x3.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:021bb40840ff35506972e6f6a7dea1b5f40a8db0927aaa9a6c116b152e386851
+size 5748

openvino/vpu_custom_kernels/convolution3x3.cl ADDED Viewed

	@@ -0,0 +1,158 @@

+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+// 3x3 convolution. Each work-group produces one output row (selected by group id 0) of
+// one output plane (selected by group id 1) from three consecutive input rows of every
+// input channel.
+//
+//   in_param   - input activations; three rows starting at row
+//                get_group_id(0) * stride_y of each of the IC planes are staged locally
+//   out        - output; OW results are written at
+//                out + get_group_id(1) * OW * OH + get_group_id(0) * OW
+//   w          - weights, IC * 3 * 3 values per output channel
+//   IW, IH, IC - input width, height and channel count
+//   OW, OH, OC - output width, height and channel count (OC is not read here)
+//   KX, KY     - not read; the kernel size is fixed at 3x3
+//   stride_x, stride_y - (1,1) and (2,2) use the vectorised path; any other combination
+//                is computed entirely by the scalar loop
+//   pad_x, dilation_x, dilation_y - only read by the scalar loop
+//   pad_y      - not read
+//
+// NOTE(review): the vectorised path takes its taps from the half8 blocks before, at and
+// after `src`, realigned with byte offsets 14 and 2. That appears to match the scalar
+// loop only for pad_x == 1 and unit dilation - confirm against the host-side binding
+// and the __builtin_shave_cmu_alignvec_rri_short8 reference.
+// NOTE(review): only three rows per channel are staged, while the scalar loop indexes
+// row ky * dilation_y, so dilation_y > 1 would index past those rows - confirm that it
+// is never bound that way.
+// NOTE(review): for the first vector of channel 0 the `src - 1` block load starts
+// 7 halves before in_local; the realignment presumably keeps only its last lane -
+// confirm.
+__kernel void Convolution3x3(
+    const __global half *in_param,
+    const __global half *out,
+    const __global half *w,
+    int IW,
+    int IH,
+    int IC,
+    int OW,
+    int OH,
+    int OC,
+    int KX,
+    int KY,
+    int stride_x,
+    int stride_y,
+    int pad_x,
+    int pad_y,
+    int dilation_x,
+    int dilation_y)
+{
+    __local half in_local[8 * 1024];
+    __local half out_local[8 * 1024];
+    __local half w_local[8 * 1024];
+    // Stage three input rows from every channel: IC lines of 3 * IW halves, packed.
+    event_t e1          = async_work_group_copy_2D2D(
+        in_local, // dst
+        in_param + get_group_id(0) * stride_y * IW, // src
+        3 * IW, // num_elements_per_line,
+        IC, // num_lines,
+        IW * IH - 3 * IW, // src_line_stride,
+        0, // dst_line_stride,
+        0);
+    wait_group_events(1, &e1);
+    // Stage the 3x3 weights of every input channel for this output channel.
+    const int sizeWeight = IC * 3 * 3;
+    e1 = async_work_group_copy(w_local, w + get_group_id(1) * sizeWeight, sizeWeight, 0);
+    wait_group_events(1, &e1);
+    // `in` is shifted by one element so that in[-1] is the first staged column.
+    __local half *in = (__local half *)in_local + 1;
+    // Number of 8-column input vectors handled by the vectorised path. It stays 0 for
+    // stride combinations that path does not support; previously it was left
+    // uninitialised in that case and then read by the loop condition.
+    int stride       = 0;
+    int write_output = 0;
+    __local half *src;
+    if ((stride_x == 1) && (stride_y == 1)) {
+        stride       = OW / 8;
+        write_output = 1;
+    }
+    if ((stride_x == 2) && (stride_y == 2)) {
+        stride       = OW / 4;
+        write_output = 2;
+    }
+    for (int ow = 0; ow < stride; ow++) {
+        float8 val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+        for (int ic = 0; ic < IC; ++ic) {
+            src             = (__local half *)((__local half8 *)(in + ic * IW * 3) + ow);
+            __local half *k = (__local half *)(w_local + ic * 3 * 3);
+            // Previous, current and next 8-column block of each of the three rows.
+            half8 aux_in00 = *((__local half8 *)src - 1);
+            half8 aux_in01 = *((__local half8 *)src + 0);
+            half8 aux_in02 = *((__local half8 *)src + 1);
+            half8 aux_in10 = *((__local half8 *)(src + IW) - 1);
+            half8 aux_in11 = *((__local half8 *)(src + IW) + 0);
+            half8 aux_in12 = *((__local half8 *)(src + IW) + 1);
+            half8 aux_in20 = *((__local half8 *)(src + IW * 2) - 1);
+            half8 aux_in21 = *((__local half8 *)(src + IW * 2) + 0);
+            half8 aux_in22 = *((__local half8 *)(src + IW * 2) + 1);
+            // Reinterpret as short8 because the realignment intrinsic takes short8.
+            short8 in00 = *((short8 *)&aux_in00);
+            short8 in01 = *((short8 *)&aux_in01);
+            short8 in02 = *((short8 *)&aux_in02);
+            short8 in10 = *((short8 *)&aux_in10);
+            short8 in11 = *((short8 *)&aux_in11);
+            short8 in12 = *((short8 *)&aux_in12);
+            short8 in20 = *((short8 *)&aux_in20);
+            short8 in21 = *((short8 *)&aux_in21);
+            short8 in22 = *((short8 *)&aux_in22);
+            // Left, centre and right tap vectors for each row (see NOTE in the header).
+            short8 aux_aux00 = __builtin_shave_cmu_alignvec_rri_short8(in00, in01, 14);
+            short8 aux_aux01 = in01;
+            short8 aux_aux02 = __builtin_shave_cmu_alignvec_rri_short8(in01, in02, 2);
+            short8 aux_aux10 = __builtin_shave_cmu_alignvec_rri_short8(in10, in11, 14);
+            short8 aux_aux11 = in11;
+            short8 aux_aux12 = __builtin_shave_cmu_alignvec_rri_short8(in11, in12, 2);
+            short8 aux_aux20 = __builtin_shave_cmu_alignvec_rri_short8(in20, in21, 14);
+            short8 aux_aux21 = in21;
+            short8 aux_aux22 = __builtin_shave_cmu_alignvec_rri_short8(in21, in22, 2);
+            half8 aux00 = *((half8 *)&aux_aux00);
+            half8 aux01 = *((half8 *)&aux_aux01);
+            half8 aux02 = *((half8 *)&aux_aux02);
+            half8 aux10 = *((half8 *)&aux_aux10);
+            half8 aux11 = *((half8 *)&aux_aux11);
+            half8 aux12 = *((half8 *)&aux_aux12);
+            half8 aux20 = *((half8 *)&aux_aux20);
+            half8 aux21 = *((half8 *)&aux_aux21);
+            half8 aux22 = *((half8 *)&aux_aux22);
+            // Broadcast each of the nine weights across all eight lanes.
+            half8 w00 = (half8)(*(k + 0));
+            half8 w01 = (half8)(*(k + 1));
+            half8 w02 = (half8)(*(k + 2));
+            half8 w10 = (half8)(*(k + 3));
+            half8 w11 = (half8)(*(k + 4));
+            half8 w12 = (half8)(*(k + 5));
+            half8 w20 = (half8)(*(k + 6));
+            half8 w21 = (half8)(*(k + 7));
+            half8 w22 = (half8)(*(k + 8));
+            // Accumulate in float; results are converted back to half when stored.
+            val += convert_float8(aux00) * convert_float8(w00);
+            val += convert_float8(aux01) * convert_float8(w01);
+            val += convert_float8(aux02) * convert_float8(w02);
+            val += convert_float8(aux10) * convert_float8(w10);
+            val += convert_float8(aux11) * convert_float8(w11);
+            val += convert_float8(aux12) * convert_float8(w12);
+            val += convert_float8(aux20) * convert_float8(w20);
+            val += convert_float8(aux21) * convert_float8(w21);
+            val += convert_float8(aux22) * convert_float8(w22);
+        }
+        // Stride 2 keeps every other lane (4 outputs); stride 1 keeps all 8.
+        if (write_output == 2) *((__local half4 *)(out_local) + ow) = convert_half4(val.s0246);
+        if (write_output == 1) *((__local half8 *)(out_local) + ow) = convert_half8(val);
+    }
+    // Scalar loop: the last OW % 8 columns after the vectorised path, or the whole row
+    // when no vectorised path applied, so out_local is always fully written.
+    const int tail_start = (write_output != 0) ? (OW & ~(0x7)) : 0;
+    for (int ow = tail_start; ow < OW; ow++) {
+        float val = 0.0f;
+        for (int ic = 0; ic < IC; ++ic) {
+            for (int ky = 0; ky < 3; ++ky) {
+                for (int kx = 0; kx < 3; ++kx) {
+                    int iw = ow * stride_x - pad_x + kx * dilation_x;
+                    val += convert_float(in[ic * IW * 3 + (ky * dilation_y) * IW + iw])
+                           * convert_float(w_local[ic * 3 * 3 + ky * 3 + kx]);
+                }
+            }
+        }
+        out_local[ow] = convert_half(val);
+    }
+    // Make the whole row visible before it is copied out to global memory.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    event_t e2 = async_work_group_copy(
+        out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
+        out_local,
+        OW,
+        0);
+    wait_group_events(1, &e2);
+}