IDM-VTON

Running on Zero

IDM-VTON / preprocess /humanparsing /modules /src /utils /cuda.cuh

IDM-VTON

update IDM-VTON Demo

938e515 8 months ago

1.76 kB

	#pragma once

	/*
	* General settings and functions
	*/
	const int WARP_SIZE = 32;
	const int MAX_BLOCK_SIZE = 1024;

	static int getNumThreads(int nElem) {
	int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
	for (int i = 0; i < 6; ++i) {
	if (nElem <= threadSizes[i]) {
	return threadSizes[i];
	}
	}
	return MAX_BLOCK_SIZE;
	}

	/*
	* Reduction utilities
	*/
	template <typename T>
	__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
	unsigned int mask = 0xffffffff) {
	#if CUDART_VERSION >= 9000
	return __shfl_xor_sync(mask, value, laneMask, width);
	#else
	return __shfl_xor(value, laneMask, width);
	#endif
	}

	__device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }

	template<typename T>
	struct Pair {
	T v1, v2;
	__device__ Pair() {}
	__device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
	__device__ Pair(T v) : v1(v), v2(v) {}
	__device__ Pair(int v) : v1(v), v2(v) {}
	__device__ Pair &operator+=(const Pair<T> &a) {
	v1 += a.v1;
	v2 += a.v2;
	return *this;
	}
	};

	template<typename T>
	static __device__ __forceinline__ T warpSum(T val) {
	#if __CUDA_ARCH__ >= 300
	for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
	val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
	}
	#else
	__shared__ T values[MAX_BLOCK_SIZE];
	values[threadIdx.x] = val;
	__threadfence_block();
	const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
	for (int i = 1; i < WARP_SIZE; i++) {
	val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
	}
	#endif
	return val;
	}

	template<typename T>
	static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
	value.v1 = warpSum(value.v1);
	value.v2 = warpSum(value.v2);
	return value;
	}