nn.RNN(input_size, hidden_size, num_layers=1, nonlinearity=tanh, bias=True, batch_first=False, dropout=0, bidirectional=False)
rnn_layer = nn.RNN(input_size=vocab_size, hidden_size=num_hiddens, ) # 定义模型, 其中vocab_size = 1027, hidden_size = 256
num_steps = 35 batch_size = 2 state = None # 初始隐藏层状态可以不定义 X = torch.rand(num_steps, batch_size, vocab_size) Y, state_new = rnn_layer(X, state) print(Y.shape, len(state_new), state_new.shape)
输出
torch.Size([35, 2, 256]) 1 torch.Size([1, 2, 256])
具体计算过程
H t = i n p u t ∗ W x h + H t − 1 ∗ W h h + b i a s H_t = input * W_{xh} + H_{t-1} * W_{hh} + bias Ht=input∗Wxh+Ht−1∗Whh+bias[batch_size, input_dim] * [input_dim, num_hiddens] + [batch_size, num_hiddens] *[num_hiddens, num_hiddens] +bias
可以发现每个隐藏状态形状都是[batch_size, num_hiddens], 起始输出也是一样的
注意:上面为了方便假设num_step=1