Spaces:
Running
on
Zero
Running
on
Zero
File size: 8,359 Bytes
826d651 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 |
import os
import subprocess
import numpy as np
import multiprocessing as mp
import math
import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.nn import Module
class DistModule(Module):
def __init__(self, module):
super(DistModule, self).__init__()
self.module = module
broadcast_params(self.module)
def forward(self, *inputs, **kwargs):
return self.module(*inputs, **kwargs)
def train(self, mode=True):
super(DistModule, self).train(mode)
self.module.train(mode)
def average_gradients(model):
""" average gradients """
for param in model.parameters():
if param.requires_grad:
dist.all_reduce(param.grad.data)
def broadcast_params(model):
""" broadcast model parameters """
for p in model.state_dict().values():
dist.broadcast(p, 0)
def dist_init(launcher, backend='nccl', **kwargs):
if mp.get_start_method(allow_none=True) is None:
mp.set_start_method('spawn')
if launcher == 'pytorch':
_init_dist_pytorch(backend, **kwargs)
elif launcher == 'mpi':
_init_dist_mpi(backend, **kwargs)
elif launcher == 'slurm':
_init_dist_slurm(backend, **kwargs)
else:
raise ValueError('Invalid launcher type: {}'.format(launcher))
def _init_dist_pytorch(backend, **kwargs):
rank = int(os.environ['RANK'])
num_gpus = torch.cuda.device_count()
torch.cuda.set_device(rank % num_gpus)
dist.init_process_group(backend=backend, **kwargs)
def _init_dist_mpi(backend, **kwargs):
raise NotImplementedError
def _init_dist_slurm(backend, port=10086, **kwargs):
proc_id = int(os.environ['SLURM_PROCID'])
ntasks = int(os.environ['SLURM_NTASKS'])
node_list = os.environ['SLURM_NODELIST']
num_gpus = torch.cuda.device_count()
torch.cuda.set_device(proc_id % num_gpus)
addr = subprocess.getoutput(
'scontrol show hostname {} | head -n1'.format(node_list))
os.environ['MASTER_PORT'] = str(port)
os.environ['MASTER_ADDR'] = addr
os.environ['WORLD_SIZE'] = str(ntasks)
os.environ['RANK'] = str(proc_id)
dist.init_process_group(backend=backend)
def gather_tensors(input_array):
world_size = dist.get_world_size()
## gather shapes first
myshape = input_array.shape
mycount = input_array.size
shape_tensor = torch.Tensor(np.array(myshape)).cuda()
all_shape = [torch.Tensor(np.array(myshape)).cuda() for i in range(world_size)]
dist.all_gather(all_shape, shape_tensor)
## compute largest shapes
all_shape = [x.cpu().numpy() for x in all_shape]
all_count = [int(x.prod()) for x in all_shape]
all_shape = [list(map(int, x)) for x in all_shape]
max_count = max(all_count)
## padding tensors and gather them
output_tensors = [torch.Tensor(max_count).cuda() for i in range(world_size)]
padded_input_array = np.zeros(max_count)
padded_input_array[:mycount] = input_array.reshape(-1)
input_tensor = torch.Tensor(padded_input_array).cuda()
dist.all_gather(output_tensors, input_tensor)
## unpadding gathered tensors
padded_output = [x.cpu().numpy() for x in output_tensors]
output = [x[:all_count[i]].reshape(all_shape[i]) for i,x in enumerate(padded_output)]
return output
def gather_tensors_batch(input_array, part_size=10):
# gather
rank = dist.get_rank()
all_features = []
part_num = input_array.shape[0] // part_size + 1 if input_array.shape[0] % part_size != 0 else input_array.shape[0] // part_size
for i in range(part_num):
part_feat = input_array[i * part_size:min((i+1)*part_size, input_array.shape[0]),...]
assert part_feat.shape[0] > 0, "rank: {}, length of part features should > 0".format(rank)
print("rank: {}, gather part: {}/{}, length: {}".format(rank, i, part_num, len(part_feat)))
gather_part_feat = gather_tensors(part_feat)
all_features.append(gather_part_feat)
print("rank: {}, gather done.".format(rank))
all_features = np.concatenate([np.concatenate([all_features[i][j] for i in range(part_num)], axis=0) for j in range(len(all_features[0]))], axis=0)
return all_features
def reduce_tensors(tensor):
reduced_tensor = tensor.clone()
dist.all_reduce(reduced_tensor)
return reduced_tensor
class DistributedSequentialSampler(Sampler):
def __init__(self, dataset, world_size=None, rank=None):
if world_size == None:
world_size = dist.get_world_size()
if rank == None:
rank = dist.get_rank()
self.dataset = dataset
self.world_size = world_size
self.rank = rank
assert len(self.dataset) >= self.world_size, '{} vs {}'.format(len(self.dataset), self.world_size)
sub_num = int(math.ceil(len(self.dataset) * 1.0 / self.world_size))
self.beg = sub_num * self.rank
#self.end = min(self.beg+sub_num, len(self.dataset))
self.end = self.beg + sub_num
self.padded_ind = list(range(len(self.dataset))) + list(range(sub_num * self.world_size - len(self.dataset)))
def __iter__(self):
indices = [self.padded_ind[i] for i in range(self.beg, self.end)]
return iter(indices)
def __len__(self):
return self.end - self.beg
class GivenIterationSampler(Sampler):
def __init__(self, dataset, total_iter, batch_size, last_iter=-1):
self.dataset = dataset
self.total_iter = total_iter
self.batch_size = batch_size
self.last_iter = last_iter
self.total_size = self.total_iter * self.batch_size
self.indices = self.gen_new_list()
self.call = 0
def __iter__(self):
if self.call == 0:
self.call = 1
return iter(self.indices[(self.last_iter + 1) * self.batch_size:])
else:
raise RuntimeError("this sampler is not designed to be called more than once!!")
def gen_new_list(self):
# each process shuffle all list with same seed, and pick one piece according to rank
np.random.seed(0)
all_size = self.total_size
indices = np.arange(len(self.dataset))
indices = indices[:all_size]
num_repeat = (all_size-1) // indices.shape[0] + 1
indices = np.tile(indices, num_repeat)
indices = indices[:all_size]
np.random.shuffle(indices)
assert len(indices) == self.total_size
return indices
def __len__(self):
return self.total_size
class DistributedGivenIterationSampler(Sampler):
def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1):
if world_size is None:
world_size = dist.get_world_size()
if rank is None:
rank = dist.get_rank()
assert rank < world_size
self.dataset = dataset
self.total_iter = total_iter
self.batch_size = batch_size
self.world_size = world_size
self.rank = rank
self.last_iter = last_iter
self.total_size = self.total_iter*self.batch_size
self.indices = self.gen_new_list()
self.call = 0
def __iter__(self):
if self.call == 0:
self.call = 1
return iter(self.indices[(self.last_iter+1)*self.batch_size:])
else:
raise RuntimeError("this sampler is not designed to be called more than once!!")
def gen_new_list(self):
# each process shuffle all list with same seed, and pick one piece according to rank
np.random.seed(0)
all_size = self.total_size * self.world_size
indices = np.arange(len(self.dataset))
indices = indices[:all_size]
num_repeat = (all_size-1) // indices.shape[0] + 1
indices = np.tile(indices, num_repeat)
indices = indices[:all_size]
np.random.shuffle(indices)
beg = self.total_size * self.rank
indices = indices[beg:beg+self.total_size]
assert len(indices) == self.total_size
return indices
def __len__(self):
# note here we do not take last iter into consideration, since __len__
# should only be used for displaying, the correct remaining size is
# handled by dataloader
#return self.total_size - (self.last_iter+1)*self.batch_size
return self.total_size
|