# Gpt/utils.py
import math
from operator import itemgetter
from typing import Any, Dict, Iterable, Optional, Tuple, Union

import torch


def attention_mask_func(attention_scores, attention_mask):
    # In-place additive masking: positions where attention_mask is True are
    # filled with a large negative score so softmax gives them ~zero weight.
    attention_scores.masked_fill_(attention_mask, -10000.0)
    return attention_scores
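

# Illustrative usage sketch (not part of the original file): build a causal
# mask and apply it with attention_mask_func. The shapes and the helper name
# _demo_attention_mask are assumptions for the demo only.
def _demo_attention_mask():
    seq_len = 4
    # [batch, heads, query, key] attention scores, all zeros for clarity.
    scores = torch.zeros(1, 1, seq_len, seq_len)
    # True marks positions to mask out: here, future tokens (causal attention).
    mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
    masked = attention_mask_func(scores, mask)
    # Every position above the diagonal is now -10000.0.
    assert (masked[0, 0, 0, 1:] == -10000.0).all()
    return masked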


@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation (tanh approximation).

    0.7978845608028654 is sqrt(2/pi); the inner expression expands to
    sqrt(2/pi) * (x + 0.044715 * x**3).
    """
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
    return gelu_impl(x)
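

# Quick numerical check (illustrative, not in the original file): the tanh
# approximation should match torch.nn.functional.gelu with approximate="tanh".
# Assumes a PyTorch version that supports the approximate argument (>= 1.12).
def _check_openai_gelu():
    import torch.nn.functional as F

    x = torch.randn(1000)
    assert torch.allclose(openai_gelu(x), F.gelu(x, approximate="tanh"), atol=1e-5)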


@torch.jit.script
def bias_gelu(bias, y):
    # Fused bias add + tanh-approximated gelu (same approximation as gelu_impl).
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))


# Gradient of the tanh approximation of gelu, used in bias_gelu_back below.
# For reference, the gradient of the exact (erf-based) gelu is:
#   0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@torch.jit.script
def bias_gelu_back(g, bias, y):
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    # ff is d(gelu)/dx of the tanh approximation, via the product rule:
    # 0.5 * x * sech^2(u) * du/dx + 0.5 * (1 + tanh(u)), where u is the tanh argument.
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
        1 + tanh_out
    )
    return ff * g
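

# Sanity sketch (illustrative, not in the original file): check the hand-written
# backward against autograd on the same tanh approximation, in double precision.
def _check_bias_gelu_back():
    y = torch.randn(8, dtype=torch.double, requires_grad=True)
    bias = torch.randn(8, dtype=torch.double, requires_grad=True)
    bias_gelu(bias, y).sum().backward()
    manual = bias_gelu_back(torch.ones(8, dtype=torch.double), bias.detach(), y.detach())
    assert torch.allclose(y.grad, manual, atol=1e-8)
    assert torch.allclose(bias.grad, manual, atol=1e-8)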


class GeLUFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, bias):
        # Computes gelu(input + bias) with the tanh approximation.
        ctx.save_for_backward(input, bias)
        return bias_gelu(bias, input)

    @staticmethod
    def backward(ctx, grad_output):
        input, bias = ctx.saved_tensors
        tmp = bias_gelu_back(grad_output, bias, input)
        # The forward computes gelu(input + bias), so the local gradients with
        # respect to input and bias are identical. If bias were broadcast
        # against input, the bias gradient would additionally need a sum over
        # the broadcast dimensions.
        return tmp, tmp


bias_gelu_impl = GeLUFunction.apply
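

# Illustrative usage (not in the original file): the fused autograd function
# should agree with gelu applied after an explicit bias add. Assumes a PyTorch
# with F.gelu(..., approximate="tanh") (>= 1.12).
def _demo_bias_gelu_impl():
    import torch.nn.functional as F

    x = torch.randn(2, 3)
    bias = torch.randn(3)
    fused = bias_gelu_impl(x, bias)
    reference = F.gelu(x + bias, approximate="tanh")
    assert torch.allclose(fused, reference, atol=1e-5)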


# This is the Python equivalent of torch.nn.functional.gelu(), written in a
# form the ONNX exporter can handle. 1.41421 approximates sqrt(2).
@torch.jit.script
def erf_gelu(x):
    return (
        x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype))
    )
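

# Quick check (illustrative, not in the original file): erf_gelu should match
# the default erf-based torch.nn.functional.gelu up to the truncated constant.
def _check_erf_gelu():
    import torch.nn.functional as F

    x = torch.randn(1000)
    assert torch.allclose(erf_gelu(x), F.gelu(x), atol=1e-4)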


def init_method_normal(sigma):
    """Init method based on N(0, sigma)."""

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


def scaled_init_method_normal(sigma, num_layers):
    """Init method based on N(0, sigma / sqrt(2 * num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_
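

# Illustrative usage (not in the original file): GPT-style models use the plain
# init for most weights and the scaled variant for residual-output projections,
# shrinking their std by sqrt(2 * num_layers) so residual contributions do not
# grow with depth. hidden and num_layers here are demo values.
def _demo_init_methods(hidden=16, num_layers=12):
    w_in = torch.empty(hidden, hidden)
    w_out = torch.empty(hidden, hidden)
    init_method_normal(0.02)(w_in)
    scaled_init_method_normal(0.02, num_layers)(w_out)
    return w_in.std().item(), w_out.std().item()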