CLASS NPUDropoutAddLayerNorm(hidden_size, prenorm=False, p=0.0, eps=1e-5, residual_in_fp32=False, dtype=None)
Computation logic:
norm_result = LayerNorm(Dropout(x0 * rowscale * layerscale) + residual)
Parameters:
- hidden_size: size of the normalized (last) dimension.
- prenorm: if True, also return the pre-normalization (dropout-add) result.
- p: dropout probability.
- eps: epsilon used in the normalization for numerical stability.
- residual_in_fp32: carry the residual branch in fp32.
- dtype: dtype of the learnable weight and bias parameters.
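For illustration only, a minimal plain-PyTorch sketch of the computation described above (the NPU operator fuses these steps and additionally handles the optional rowscale/layerscale inputs, which are omitted here):

import torch
import torch.nn.functional as F

def dropout_add_layer_norm_ref(x0, residual, weight, bias, p=0.0, eps=1e-5):
    # Unfused reference of: LayerNorm(Dropout(x0) + residual)
    pre_norm = F.dropout(x0, p=p) + residual
    return F.layer_norm(pre_norm, (pre_norm.shape[-1],), weight, bias, eps)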
CLASS NPUDropoutAddRMSNorm(hidden_size, prenorm=False, p=0.0, eps=1e-5, residual_in_fp32=False, dtype=None)
Computation logic:
norm_result = RMSNorm(Dropout(x0 * rowscale * layerscale) + residual)
Parameters:
- hidden_size: size of the normalized (last) dimension.
- prenorm: if True, also return the pre-normalization (dropout-add) result.
- p: dropout probability.
- eps: epsilon used in the normalization for numerical stability.
- residual_in_fp32: carry the residual branch in fp32.
- dtype: dtype of the learnable weight parameter.
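A corresponding sketch for the RMSNorm variant, which omits mean-centering and bias (illustration only, same assumptions as above):

import torch
import torch.nn.functional as F

def dropout_add_rms_norm_ref(x0, residual, weight, p=0.0, eps=1e-5):
    # Unfused reference of: RMSNorm(Dropout(x0) + residual)
    pre_norm = F.dropout(x0, p=p) + residual
    inv_rms = torch.rsqrt(pre_norm.pow(2).mean(dim=-1, keepdim=True) + eps)
    return pre_norm * inv_rms * weight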
ascendspeed_te_ops.npu_dropout_add_layer_norm(x0, weight, residual=None, bias=None, rowscale=None, layerscale=None, p=0., eps=1e-5, prenorm=False, residual_in_fp32=False, is_rms_norm=False, return_dropout_mask=False)
Computation logic:
is_rms_norm=False: norm_result = LayerNorm(Dropout(x0 * rowscale * layerscale) + residual)
is_rms_norm=True:  norm_result = RMSNorm(Dropout(x0 * rowscale * layerscale) + residual)
Inputs:
- x0: input tensor of shape (batch, seqlen, hidden_size).
- weight, bias: affine parameters of the normalization, each of shape (hidden_size,).
- residual: optional residual tensor added after dropout, same shape as x0.
- rowscale: optional per-row scale of shape (batch, seqlen), multiplied into x0 before dropout.
- layerscale: optional per-channel scale of shape (hidden_size,), multiplied into x0 before dropout.
- p: dropout probability.
- eps: normalization epsilon.
- prenorm: if True, also return the pre-normalization (dropout-add) result.
- residual_in_fp32: carry the residual in fp32.
- is_rms_norm: use RMSNorm instead of LayerNorm.
- return_dropout_mask: if True, also return the dropout mask.
Supported input dtype combinations:
x0      residual    weight    norm_result
==========================================
fp32    fp32        fp32      fp32
fp16    fp32        fp32      fp16
fp16    fp16        fp32      fp16
bf16    fp32        fp32      bf16
bf16    bf16        fp32      bf16
fp16    fp16        fp16      fp16
bf16    bf16        bf16      bf16
Outputs:
- norm_result: the normalized output.
- pre_norm_result: the dropout-add result before normalization (meaningful when prenorm=True).
- mask_result: the dropout mask (meaningful when return_dropout_mask=True).
Attributes:
- weight, bias: learnable normalization parameters of the module classes (NPUDropoutAddRMSNorm has weight only); the constructor arguments p, eps, prenorm and residual_in_fp32 are also stored as attributes (used in the mixed example below).
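A mixed-precision sketch matching the fp16/fp32 rows of the table above (illustration only; the argument order follows the positional convention of the functional example further below):

import torch
import torch_npu
import ascendspeed_te_ops

hidden_size = 192
x0 = torch.randn(8, 512, hidden_size, dtype=torch.float16).npu()
residual = torch.randn(8, 512, hidden_size, dtype=torch.float32).npu()
weight = torch.ones(hidden_size, dtype=torch.float32).npu()
# fp16 x0 with fp32 residual/weight -> fp16 norm_result (second row of the table)
norm_result, pre_norm_result, mask_result = ascendspeed_te_ops.npu_dropout_add_layer_norm(
    x0,
    weight,
    residual,
    None,   # bias
    None,   # rowscale
    None,   # layerscale
    0.1,    # p
    1e-5,   # eps
    True,   # prenorm
    True,   # residual_in_fp32
    False,  # is_rms_norm
    True,   # return_dropout_mask
)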
# Example 1: module-style API, NPUDropoutAddLayerNorm
import torch
import torch_npu
from ascendspeed.te.ops.dropout_add_layer_norm import NPUDropoutAddLayerNorm
batch_size = 8
seqlen = 512
hidden_size = 192
# Forward pass
model = NPUDropoutAddLayerNorm(hidden_size, p=0.37, eps=1e-5,
residual_in_fp32=True, dtype=torch.float).npu()
torch.nn.init.normal_(model.weight)
torch.nn.init.normal_(model.bias)
x0 = torch.randn(batch_size, seqlen, hidden_size, dtype=torch.float, requires_grad=True)
residual = torch.randn_like(x0, dtype=torch.float, requires_grad=True)
norm_result, _, _ = model(x0.npu(), residual.npu())
# Backward pass: collect gradients for x0, residual and the module parameters
g = torch.randn_like(norm_result) / batch_size
norm_result.backward(g)
x0_grad = x0.grad
residual_grad = residual.grad
weight_grad = model.weight.grad
bias_grad = model.bias.grad
# Example 2: module-style API, NPUDropoutAddRMSNorm
import torch
import torch_npu
from ascendspeed.te.ops.dropout_add_rms_norm import NPUDropoutAddRMSNorm
batch_size = 8
seqlen = 512
hidden_size = 192
# Forward pass
model = NPUDropoutAddRMSNorm(hidden_size, p=0.37, eps=1e-5,
residual_in_fp32=True, dtype=torch.float).npu()
torch.nn.init.normal_(model.weight)
x0 = torch.randn(batch_size, seqlen, hidden_size, dtype=torch.float, requires_grad=True)
residual = torch.randn_like(x0, dtype=torch.float, requires_grad=True)
norm_result, _, _ = model(x0.npu(), residual.npu())
# Backward pass: collect gradients for x0, residual and the weight
g = torch.randn_like(norm_result) / batch_size
norm_result.backward(g)
x0_grad = x0.grad
residual_grad = residual.grad
weight_grad = model.weight.grad
# Example 3: functional API, ascendspeed_te_ops.npu_dropout_add_layer_norm
import torch
import torch_npu
import ascendspeed_te_ops
batch_size = 8
seqlen = 512
hidden_size = 192
# Forward pass
x0 = torch.randn(batch_size, seqlen, hidden_size, dtype=torch.float, requires_grad=True)
residual = torch.randn_like(x0, dtype=torch.float, requires_grad=True)
weight = torch.ones(hidden_size, dtype=torch.float, requires_grad=True)
bias = torch.zeros(hidden_size, dtype=torch.float)
rowscale = torch.empty(batch_size, seqlen, dtype=torch.float)
survival_rate = 0.87
rowscale = rowscale.bernoulli_(survival_rate) / survival_rate
layerscale = torch.randn(hidden_size, dtype=torch.float, requires_grad=True)
norm_result, pre_norm_result, mask_result = ascendspeed_te_ops.npu_dropout_add_layer_norm(
x0.npu(),
weight.npu(),
residual.npu(),
bias.npu(),
rowscale.npu(),
layerscale.npu(),
0.37, # p
1e-5, # eps
True, # prenorm
True, # residual_in_fp32
False, # is_rms_norm
True, # return_dropout_mask
)
# Backward pass: use both outputs so gradients flow through norm_result and pre_norm_result
g = torch.randn_like(norm_result) / batch_size
(norm_result * torch.sigmoid(pre_norm_result)).backward(g)
x0_grad = x0.grad
residual_grad = residual.grad
weight_grad = weight.grad
layerscale_grad = layerscale.grad
# Example 4: mixing the module's parameters/attributes with the functional API
import torch
import torch_npu
import ascendspeed_te_ops
from ascendspeed_te_ops import npu_dropout_add_layer_norm
from ascendspeed.te.ops.dropout_add_layer_norm import NPUDropoutAddLayerNorm
batch_size = 8
seqlen = 512
hidden_size = 192
# Forward pass
x0 = torch.randn(batch_size, seqlen, hidden_size, dtype=torch.float, requires_grad=True)
residual = torch.randn_like(x0, dtype=torch.float, requires_grad=True)
rowscale = torch.empty(batch_size, seqlen, dtype=torch.float)
survival_rate = 0.87
rowscale = rowscale.bernoulli_(survival_rate) / survival_rate
layerscale = torch.randn(hidden_size, dtype=torch.float, requires_grad=True)
layer_norm_cls = NPUDropoutAddLayerNorm
layer_norm_func = npu_dropout_add_layer_norm
model = layer_norm_cls(hidden_size, prenorm=True, p=0.37, eps=1e-5,
residual_in_fp32=True, dtype=torch.float).npu()
torch.nn.init.normal_(model.weight)
torch.nn.init.normal_(model.bias)
norm_result, pre_norm_result, mask_result = layer_norm_func(
x0.npu(),
model.weight.npu(),
residual.npu(),
model.bias.npu(),
rowscale.npu(),
layerscale.npu(),
model.p,
model.eps,
model.prenorm,
model.residual_in_fp32,
False, # is_rms_norm
True, # return_dropout_mask
)
# Backward pass: use both outputs so gradients flow through norm_result and pre_norm_result
g = torch.randn_like(norm_result) / batch_size
(norm_result * torch.sigmoid(pre_norm_result)).backward(g)
x0_grad = x0.grad
residual_grad = residual.grad
weight_grad = model.weight.grad
bias_grad = model.bias.grad
layerscale_grad = layerscale.grad