IDM-VTON

Running on Zero

IDM-VTON/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu

IDM-VTON

update IDM-VTON Demo

938e515 8 months ago

9.83 kB

	#include <ATen/ATen.h>

	#include <thrust/device_ptr.h>
	#include <thrust/transform.h>

	#include <vector>

	#include "utils/checks.h"
	#include "utils/cuda.cuh"
	#include "inplace_abn.h"

	#include <ATen/cuda/CUDAContext.h>

	// Operations for reduce
	// Element-fetch functor handed to `reduce`: returns one value of a tensor
	// addressed as (batch, plane, n) with a contiguous layout.
	template<typename T>
	struct SumOp {
	  __device__ SumOp(const T *data, int channels, int spatial)
	      : tensor(data), chn(channels), sp(spatial) {}

	  // Reads the element at position `n` of plane `plane` in sample `batch`.
	  __device__ __forceinline__ T operator()(int batch, int plane, int n) {
	    const int offset = (batch * chn + plane) * sp + n;
	    return tensor[offset];
	  }

	  const T *tensor;  // base pointer of the data being read
	  const int chn;    // plane (channel) count used in the flat index
	  const int sp;     // number of elements in one (batch, plane) slice
	};

	// Element-fetch functor handed to `reduce`: returns the squared deviation
	// of one tensor element from a fixed mean.
	template<typename T>
	struct VarOp {
	  __device__ VarOp(T center, const T *data, int channels, int spatial)
	      : mean(center), tensor(data), chn(channels), sp(spatial) {}

	  // Squared distance between the element at (batch, plane, n) and `mean`.
	  __device__ __forceinline__ T operator()(int batch, int plane, int n) {
	    const T diff = tensor[(batch * chn + plane) * sp + n] - mean;
	    return diff * diff;
	  }

	  const T mean;     // value subtracted from every element
	  const T *tensor;  // base pointer of the data being read
	  const int chn;    // plane (channel) count used in the flat index
	  const int sp;     // number of elements in one (batch, plane) slice
	};

	// Element-fetch functor handed to `reduce` in the backward pass.
	//
	// For the element at (batch, plane, n) it recovers the normalized value
	//   y = (z - bias) / weight
	// from the stored output `z`, and returns the pair (dz, y * dz).
	//
	// `_z` and `_dz` are pointers to the output and gradient buffers; both are
	// indexed with the same contiguous (batch, plane, n) layout.
	template<typename T>
	struct GradOp {
	  // Fix: `_z` / `_dz` must be pointers, since they initialize the pointer
	  // members below and are indexed in operator().
	  __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
	      : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}

	  __device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
	    T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
	    T _dz = dz[(batch * chn + plane) * sp + n];
	    return Pair<T>(_dz, _y * _dz);
	  }

	  const T weight;  // scale used to invert the affine transform
	  const T bias;    // shift used to invert the affine transform
	  const T *z;      // stored forward output
	  const T *dz;     // gradient with respect to z
	  const int chn;   // plane (channel) count used in the flat index
	  const int sp;    // number of elements in one (batch, plane) slice
	};

	/***********
	* mean_var
	***********/

	// Computes the mean and (biased) variance of one plane of `x`.
	//
	// Launch layout: one block per plane (blockIdx.x selects the plane); the
	// threads of the block cooperate through `reduce`.
	//
	// x    : input buffer, read as (num, chn, sp) contiguous
	// mean : output, one value per plane
	// var  : output, one value per plane
	//
	// Fix: `x` and `mean` are pointers (they are indexed below and receive the
	// tensor data pointers from mean_var_cuda), not scalars.
	template<typename T>
	__global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
	  int plane = blockIdx.x;
	  // Both statistics are normalized by the element count of a plane.
	  T norm = T(1) / T(num * sp);

	  T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
	  // Barrier between the two back-to-back reductions.
	  // NOTE(review): presumably `reduce` uses shared scratch space that must not
	  // be overwritten while still being read — confirm in utils/cuda.cuh.
	  __syncthreads();
	  T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;

	  // A single thread publishes the results for this plane.
	  if (threadIdx.x == 0) {
	    mean[plane] = _mean;
	    var[plane] = _var;
	  }
	}

	// Host entry point: launches mean_var_kernel on `x` and returns the two
	// resulting tensors as {mean, var}, each of length `chn`.
	std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
	  CHECK_CUDA_INPUT(x);

	  // Break the shape of x down into (num, chn, sp).
	  int64_t num, chn, sp;
	  get_dims(x, num, chn, sp);

	  // Outputs share the tensor options of the input.
	  auto out_options = x.options();
	  auto mean = at::empty({chn}, out_options);
	  auto var = at::empty({chn}, out_options);

	  // One block per channel, launched on the current stream.
	  dim3 grid(chn);
	  dim3 block(getNumThreads(sp));
	  auto cuda_stream = at::cuda::getCurrentCUDAStream();
	  AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
	    mean_var_kernel<scalar_t><<<grid, block, 0, cuda_stream>>>(
	        x.data<scalar_t>(), mean.data<scalar_t>(), var.data<scalar_t>(),
	        num, chn, sp);
	  }));

	  return {mean, var};
	}

	/**********
	* forward
	**********/

	// Normalizes one plane of `x` in place:
	//   x <- (x - mean) * rsqrt(var + eps) * w + b
	// where w = |weight| + eps and b = bias when `affine` is true, and
	// w = 1, b = 0 otherwise.
	//
	// Launch layout: one block per plane (blockIdx.x); threads stride over the
	// `sp` elements of each (batch, plane) slice.
	//
	// Fix: `x`, `mean`, `var` and `weight` are pointers (they are indexed below
	// and receive tensor data pointers from forward_cuda), not scalars.
	template<typename T>
	__global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
	                               bool affine, float eps, int num, int chn, int sp) {
	  int plane = blockIdx.x;

	  T _mean = mean[plane];
	  T _var = var[plane];
	  // weight/bias are only dereferenced when the affine transform is enabled.
	  T _weight = affine ? abs(weight[plane]) + eps : T(1);
	  T _bias = affine ? bias[plane] : T(0);

	  // Combined scale factor, computed once per plane.
	  T mul = rsqrt(_var + eps) * _weight;

	  for (int batch = 0; batch < num; ++batch) {
	    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
	      T _x = x[(batch * chn + plane) * sp + n];
	      T _y = (_x - _mean) * mul + _bias;

	      x[(batch * chn + plane) * sp + n] = _y;
	    }
	  }
	}

	// Host entry point: runs forward_kernel over `x` and returns `x`, which the
	// kernel overwrites in place.
	at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
	                        bool affine, float eps) {
	  // Validate every tensor argument.
	  CHECK_CUDA_INPUT(x);
	  CHECK_CUDA_INPUT(mean);
	  CHECK_CUDA_INPUT(var);
	  CHECK_CUDA_INPUT(weight);
	  CHECK_CUDA_INPUT(bias);

	  // Break the shape of x down into (num, chn, sp).
	  int64_t num, chn, sp;
	  get_dims(x, num, chn, sp);

	  // One block per channel, launched on the current stream.
	  dim3 grid(chn);
	  dim3 block(getNumThreads(sp));
	  auto cuda_stream = at::cuda::getCurrentCUDAStream();
	  AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
	    forward_kernel<scalar_t><<<grid, block, 0, cuda_stream>>>(
	        x.data<scalar_t>(), mean.data<scalar_t>(), var.data<scalar_t>(),
	        weight.data<scalar_t>(), bias.data<scalar_t>(),
	        affine, eps, num, chn, sp);
	  }));

	  return x;
	}

	/***********
	* edz_eydz
	***********/

	// Reduces GradOp over one plane and stores the two components of the result:
	//   edz[plane]  <- first component  (built from dz)
	//   eydz[plane] <- second component (built from y * dz, y = (z - bias) / weight)
	// NOTE(review): `reduce` is defined elsewhere; presumably it sums over the
	// batch and spatial positions of the plane — confirm in utils/cuda.cuh.
	//
	// Launch layout: one block per plane (blockIdx.x).
	//
	// Fix: z, dz, weight, bias, edz and eydz are pointers (they are indexed or
	// forwarded to GradOp, and receive tensor data pointers from edz_eydz_cuda).
	template<typename T>
	__global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
	                                T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
	  int plane = blockIdx.x;

	  // Same effective scale/shift as the forward pass; T(1)/T(0) keeps the
	  // literals in the kernel's scalar type, matching forward_kernel.
	  T _weight = affine ? abs(weight[plane]) + eps : T(1);
	  T _bias = affine ? bias[plane] : T(0);

	  Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
	  __syncthreads();

	  // A single thread publishes the results for this plane.
	  if (threadIdx.x == 0) {
	    edz[plane] = res.v1;
	    eydz[plane] = res.v2;
	  }
	}

	// Host entry point: launches edz_eydz_kernel and returns {edz, eydz}, two
	// freshly allocated tensors of length `chn`.
	std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
	                                      bool affine, float eps) {
	  // Validate every tensor argument.
	  CHECK_CUDA_INPUT(z);
	  CHECK_CUDA_INPUT(dz);
	  CHECK_CUDA_INPUT(weight);
	  CHECK_CUDA_INPUT(bias);

	  // Break the shape of z down into (num, chn, sp).
	  int64_t num, chn, sp;
	  get_dims(z, num, chn, sp);

	  // Outputs share the tensor options of z.
	  auto out_options = z.options();
	  auto edz = at::empty({chn}, out_options);
	  auto eydz = at::empty({chn}, out_options);

	  // One block per channel, launched on the current stream.
	  dim3 grid(chn);
	  dim3 block(getNumThreads(sp));
	  auto cuda_stream = at::cuda::getCurrentCUDAStream();
	  AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
	    edz_eydz_kernel<scalar_t><<<grid, block, 0, cuda_stream>>>(
	        z.data<scalar_t>(), dz.data<scalar_t>(),
	        weight.data<scalar_t>(), bias.data<scalar_t>(),
	        edz.data<scalar_t>(), eydz.data<scalar_t>(),
	        affine, eps, num, chn, sp);
	  }));

	  return {edz, eydz};
	}

	/***********
	* backward
	***********/

	// Writes the input gradient for one plane:
	//   y  = (z - bias) / weight
	//   dx = (dz - edz / count - y * eydz / count) * weight * rsqrt(var + eps)
	// with count = num * sp, weight = |weight| + eps and bias taken from the
	// parameters when `affine` is true (1 and 0 otherwise).
	//
	// Launch layout: one block per plane (blockIdx.x); threads stride over the
	// `sp` elements of each (batch, plane) slice.
	//
	// Fix: z, dz, var, weight, bias, edz, eydz and dx are pointers (they are
	// indexed below and receive tensor data pointers from backward_cuda).
	template<typename T>
	__global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
	                                const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
	  int plane = blockIdx.x;

	  // T(1)/T(0) keeps the literals in the kernel's scalar type, matching
	  // forward_kernel.
	  T _weight = affine ? abs(weight[plane]) + eps : T(1);
	  T _bias = affine ? bias[plane] : T(0);
	  T _var = var[plane];
	  T _edz = edz[plane];
	  T _eydz = eydz[plane];

	  // Per-plane constants hoisted out of the element loop.
	  T _mul = _weight * rsqrt(_var + eps);
	  T count = T(num * sp);

	  for (int batch = 0; batch < num; ++batch) {
	    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
	      T _dz = dz[(batch * chn + plane) * sp + n];
	      // Recover the normalized activation from the stored output.
	      T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;

	      dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
	    }
	  }
	}

	// Host entry point: launches backward_kernel and returns `dx`, a new tensor
	// shaped like `z`.
	at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
	                         at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
	  // Validate every tensor argument.
	  CHECK_CUDA_INPUT(z);
	  CHECK_CUDA_INPUT(dz);
	  CHECK_CUDA_INPUT(var);
	  CHECK_CUDA_INPUT(weight);
	  CHECK_CUDA_INPUT(bias);
	  CHECK_CUDA_INPUT(edz);
	  CHECK_CUDA_INPUT(eydz);

	  // Break the shape of z down into (num, chn, sp).
	  int64_t num, chn, sp;
	  get_dims(z, num, chn, sp);

	  // Zero-initialized output buffer.
	  auto dx = at::zeros_like(z);

	  // One block per channel, launched on the current stream.
	  dim3 grid(chn);
	  dim3 block(getNumThreads(sp));
	  auto cuda_stream = at::cuda::getCurrentCUDAStream();
	  AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
	    backward_kernel<scalar_t><<<grid, block, 0, cuda_stream>>>(
	        z.data<scalar_t>(), dz.data<scalar_t>(), var.data<scalar_t>(),
	        weight.data<scalar_t>(), bias.data<scalar_t>(),
	        edz.data<scalar_t>(), eydz.data<scalar_t>(),
	        dx.data<scalar_t>(),
	        affine, eps, num, chn, sp);
	  }));

	  return dx;
	}

	/**************
	* activations
	**************/

	// In-place backward step over `count` elements, run with Thrust on the
	// current CUDA stream. For every index where z < 0:
	//   1. dz <- dz * slope
	//   2. z  <- z / slope
	// Elements with z >= 0 are left untouched in both buffers.
	//
	// z, dz : device pointers to `count` elements each.
	//
	// Fix: `z` and `dz` are pointers (they are wrapped with
	// thrust::device_pointer_cast and the caller passes tensor data pointers).
	template<typename T>
	inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
	  // Create thrust pointers
	  thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
	  thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);

	  auto stream = at::cuda::getCurrentCUDAStream();
	  // Scale the gradient; z acts as the stencil selecting which entries change.
	  thrust::transform_if(thrust::cuda::par.on(stream),
	                       th_dz, th_dz + count, th_z, th_dz,
	                       [slope] __device__ (const T& dz) { return dz * slope; },
	                       [] __device__ (const T& z) { return z < 0; });
	  // Must run after the gradient update: it rewrites z, which the first
	  // pass uses as its stencil.
	  thrust::transform_if(thrust::cuda::par.on(stream),
	                       th_z, th_z + count, th_z,
	                       [slope] __device__ (const T& z) { return z / slope; },
	                       [] __device__ (const T& z) { return z < 0; });
	}

	// Host entry point: validates `z` and `dz`, then dispatches on the scalar
	// type of `z` to leaky_relu_backward_impl, which updates both in place.
	void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
	  CHECK_CUDA_INPUT(z);
	  CHECK_CUDA_INPUT(dz);

	  // Total number of elements to process.
	  int64_t n_elements = z.numel();

	  AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
	    leaky_relu_backward_impl<scalar_t>(
	        z.data<scalar_t>(),
	        dz.data<scalar_t>(),
	        slope,
	        n_elements);
	  }));
	}

	// In-place backward step over `count` elements, run with Thrust on the
	// current CUDA stream. For every index where z < 0:
	//   1. dz <- dz * (z + 1)
	//   2. z  <- log1p(z)
	// Elements with z >= 0 are left untouched in both buffers.
	//
	// z, dz : device pointers to `count` elements each.
	//
	// Fix: `z` and `dz` are pointers (they are wrapped with
	// thrust::device_pointer_cast and the caller passes tensor data pointers).
	template<typename T>
	inline void elu_backward_impl(T *z, T *dz, int64_t count) {
	  // Create thrust pointers
	  thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
	  thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);

	  auto stream = at::cuda::getCurrentCUDAStream();
	  // Binary transform over (dz, z); z is also the stencil selecting entries.
	  thrust::transform_if(thrust::cuda::par.on(stream),
	                       th_dz, th_dz + count, th_z, th_z, th_dz,
	                       [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
	                       [] __device__ (const T& z) { return z < 0; });
	  // Must run after the gradient update: it rewrites z, which the first
	  // pass reads both as an operand and as its stencil.
	  thrust::transform_if(thrust::cuda::par.on(stream),
	                       th_z, th_z + count, th_z,
	                       [] __device__ (const T& z) { return log1p(z); },
	                       [] __device__ (const T& z) { return z < 0; });
	}

	// Host entry point: validates `z` and `dz`, then dispatches on the scalar
	// type of `z` to elu_backward_impl, which updates both tensors in place.
	void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
	  CHECK_CUDA_INPUT(z);
	  CHECK_CUDA_INPUT(dz);

	  int64_t count = z.numel();

	  // Fix: the dispatch name was copy-pasted as "leaky_relu_backward_cuda",
	  // so unsupported-dtype errors pointed at the wrong function.
	  AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cuda", ([&] {
	    elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
	  }));
	}