In [1]:
import torch
import torch.nn.functional as F
import string

# Build the vocabulary: every lowercase ASCII letter maps to its alphabetical
# position (a -> 0, ..., z -> 25).
char2indx = dict(zip(string.ascii_lowercase, range(len(string.ascii_lowercase))))
char2indx

{'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25}

In [2]:
# Split the sample word into its individual characters.
example = [*'love']
example

['l', 'o', 'v', 'e']

In [3]:
# Use the vocabulary to turn the text into integer indices.
# The tensor is built in a single expression so that `idx` is never bound to a
# temporary Python list and no loop variable leaks into the kernel namespace.
idx = torch.tensor([char2indx[ch] for ch in example])
idx, idx.shape

(tensor([11, 14, 21, 4]), torch.Size([4]))

In [4]:
# One-hot encode the indices, turning the text into a 2-D float tensor
# with one row per character and one column per vocabulary entry.
num_claz = 26  # vocabulary size (number of one-hot classes)
dims = 5       # embedding width
x = F.one_hot(idx, num_classes=num_claz).to(torch.float32)
x, x.shape

(tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
 0., 0., 0., 0., 0., 0., 0., 0.],
 [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
 0., 0., 0., 0., 0., 0., 0., 0.],
 [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 0., 0., 0., 1., 0., 0., 0., 0.],
 [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 0., 0., 0., 0., 0., 0., 0., 0.]]),
 torch.Size([4, 26]))

In [5]:
# A text embedding is really just a matrix multiplication:
#   x (4, 26) @ W (26, 5) -> (4, 5)
W = torch.randn(num_claz, dims)
torch.matmul(x, W)

tensor([[-1.7867, -1.8944, 0.1891, -3.3317, 0.4883],
 [-1.3727, 1.1942, 0.1609, -1.8016, 0.3551],
 [ 0.0374, 0.9542, 0.1898, -0.4440, 1.4332],
 [-1.0798, 0.7559, 0.9129, 0.4616, -0.2050]])

In [6]:
# Same result as the matrix multiplication above, but a friendlier implementation:
# the index tensor idx has fewer dimensions and no one-hot encoding is needed.
#   W (26, 5) indexed by idx (4,) -> (4, 5)
W[idx]

tensor([[-1.7867, -1.8944, 0.1891, -3.3317, 0.4883],
 [-1.3727, 1.1942, 0.1609, -1.8016, 0.3551],
 [ 0.0374, 0.9542, 0.1898, -0.4440, 1.4332],
 [-1.0798, 0.7559, 0.9129, 0.4616, -0.2050]])

In [7]:
# Example implementation of a text (character) embedding layer.
class Embedding:
    """Minimal embedding layer: a lookup table indexed by integer tensors.

    Attributes:
        weight: Tensor of shape (num_embeddings, embedding_dim) drawn from a
            standard normal distribution; row ``i`` is the vector for index ``i``.
        out: Result of the most recent call (set by ``__call__``).
    """

    def __init__(self, num_embeddings, embedding_dim, generator=None):
        """Create the randomly initialised lookup table.

        Args:
            num_embeddings: Number of rows in the table (vocabulary size).
            embedding_dim: Length of each embedding vector.
            generator: Optional ``torch.Generator`` forwarded to ``torch.randn``
                so the initial weights can be reproduced. ``None`` (the
                default) uses the global RNG, as before.
        """
        self.weight = torch.randn((num_embeddings, embedding_dim), generator=generator)

    def __call__(self, idx):
        """Look up the embedding vectors for ``idx``.

        Args:
            idx: Integer index tensor of any shape; its values select rows of
                ``weight``.

        Returns:
            Tensor of shape ``idx.shape + (embedding_dim,)``. The same tensor
            is also stored on ``self.out``.
        """
        # Plain tensor indexing is equivalent to one-hot(idx) @ weight,
        # without materialising the one-hot matrix.
        self.out = self.weight[idx]
        return self.out

    def parameters(self):
        """Return the layer's parameters as a list containing only ``weight``."""
        return [self.weight]

In [8]:
# Correct usage: pass the integer index tensor straight to the embedding layer.
emb = Embedding(num_embeddings=num_claz, embedding_dim=dims)
(
    emb(idx),
    idx.shape,
    emb(idx).shape,
)

(tensor([[-0.9700, 1.7496, -1.6055, 0.6170, -0.3594],
 [-1.3329, -0.3346, 0.6670, -0.2516, 0.6160],
 [-0.9252, 0.7330, 0.0849, -0.2643, 0.1934],
 [-0.2149, -0.4215, 1.2895, -0.6259, 0.9605]]),
 torch.Size([4]),
 torch.Size([4, 5]))

In [9]:
# A higher-dimensional example:
# bidx can be read as 10 texts, each 11 units long (the unit being one letter).
bidx = torch.randint(low=0, high=num_claz, size=(10, 11))
bidx.shape, emb(bidx).shape

(torch.Size([10, 11]), torch.Size([10, 11, 5]))

In [10]:
# Incorrect usage: x is the one-hot encoding, not an index tensor.
# Its entries (0 or 1) are treated as row indices, so every position looks up
# row 0 or row 1 of the weight table instead of the intended letter's row.
emb(x.to(torch.int32))

tensor([[[ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [-0.4473, 1.5996, 1.8102, -1.1696, 0.2618],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448, -1.1311, 0.3100],
 [ 0.5768, 0.0849, -1.4448,