{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# 安装第三方库\n", "!pip install transformers datasets" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "\n", "tokenizer_gpt2 = AutoTokenizer.from_pretrained('gpt2')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# 分词效果示例,三段文本表示的意思是相近的\n", "text_fr = '''Évariste Galois (/ɡælˈwɑː/; français : [evaʁist ɡalwa] ; 25 octobre 1811 - 31 mai 1832) était un mathématicien français et un militant politique. Alors qu'il était encore adolescent, il parvint à déterminer une condition nécessaire et suffisante pour qu'un polynôme soit résoluble par des radicaux, résolvant ainsi un problème qui était resté ouvert pendant 350 ans. Son travail posa les fondements de la théorie de Galois et de la théorie des groupes, deux branches majeures de l'algèbre abstraite. Il était un fervent républicain et fut très impliqué dans les troubles politiques qui entourèrent la Révolution française de 1830. En raison de son activisme politique, il fut arrêté à plusieurs reprises, purgé une peine de plusieurs mois de prison. Pour des raisons restées obscures, peu de temps après sa libération de prison, il se battit en duel et décéda des blessures qu'il subit.'''\n", "text_en = '''Évariste Galois (/ɡælˈwɑː/; French: [evaʁist ɡalwa]; 25 October 1811 – 31 May 1832) was a French mathematician and political activist. While still in his teens, he was able to determine a necessary and sufficient condition for a polynomial to be solvable by radicals, thereby solving a problem that had been open for 350 years. His work laid the foundations for Galois theory and group theory, two major branches of abstract algebra. He was a staunch republican and was heavily involved in the political turmoil that surrounded the French Revolution of 1830. As a result of his political activism, he was arrested repeatedly, serving one jail sentence of several months. For reasons that remain obscure, shortly after his release from prison he fought in a duel and died of the wounds he suffered.'''\n", "text_zh = '''埃瓦里斯特·伽罗瓦(法语:Évariste Galois,1811年10月25日—1832年5月31日,法语发音: [evaʁist ɡalwa])是一位法国数学家和政治活动家。尽管还在十几岁时,他就能够确定多项式能够通过根式求解的充分必要条件,从而解决了一个悬而未决的问题,该问题已经存在了350年。他的工作奠定了Galois理论和群论的基础,这两个是抽象代数的重要分支。他是一位坚定的共和派,深度参与了1830年法国大革命期间的政治动荡。由于他的政治活动,他多次被逮捕,其中一次入狱数月。由于原因不明,他在刑满释放后不久,参与了一场决斗并因受伤而去世。'''\n", "\n", "texts = {\n", " 'fr': text_fr,\n", " 'en': text_en,\n", " 'zh': text_zh\n", "}" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def get_token_stats(tokenizer):\n", " # 统计文本中的单词数量(如果是中文,则为文本的字数)\n", " str_stats = {}\n", " # 统计分词后的词元数量\n", " token_stats = {}\n", " for (k, v) in texts.items():\n", " text_len = len(v.split()) if k != 'zh' else len(list(v))\n", " token_len = len(tokenizer.encode(v))\n", " str_stats[k] = text_len\n", " token_stats[k] = token_len\n", " print(str_stats)\n", " print(token_stats)\n", " return draw_bar(str_stats, token_stats)\n", " \n", "def draw_bar(str_stats, token_stats):\n", " # 将统计结果可视化\n", " fig = plt.figure(figsize=(6, 6), dpi=80)\n", " plt.rcParams['font.sans-serif'] = ['SimHei']\n", " plt.rcParams['axes.unicode_minus'] = False\n", " plt.rcParams.update({'font.size': 13})\n", " bar_width = 0.1\n", " base = range(len(str_stats))\n", " br_str = [x - bar_width for x in base]\n", " br_token = [x + bar_width for x in base]\n", " plt.bar(br_str, str_stats.values(), color ='g',\n", " width = bar_width * 2, label ='文本长度')\n", " plt.bar(br_token, token_stats.values(), color ='b',\n", " width = bar_width * 2, label ='分词后的长度')\n", " plt.xticks([r for r in base], str_stats.keys(), fontsize=18)\n", " plt.legend(shadow=True)\n", " return fig" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fr': 142, 'en': 132, 'zh': 278}\n", "{'fr': 307, 'en': 176, 'zh': 497}\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaEAAAGRCAYAAADM0gxvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/OQEPoAAAACXBIWXMAAAxOAAAMTgF/d4wjAAAmVklEQVR4nO3de3hV1Z3/8fcXCAGDigXFKtB4xfsw0EoVtNSfVguOOnVaL61W0UGrrVo6zm+kOqOM1rE4Vltbsa3WW2lHpaJV63R0iuNMEe+ONxQUa7mIggYIEBKS9fvjnJxfEhIMQlgkvF/Pcx5P9lp773XiefJh7b32WpFSQpKkHLrlboAkaetlCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlE2P3A1oS3l5edpxxx1zN0OStJEWLFhQm1Iqb61siw2hHXfckfnz5+duhiRpI0XE+22VeTlOkpSNISRJysYQkiRlYwhJkrLZYgcmtFd9fX3uJmgL1L1799xNkNQOnTaE3nvvPRYuXGgIqVXdu3dnl112YaeddsrdFEnr0SlD6L333mPBggXsvvvuVFRUEBG5m6QtSEqJlStX8tZbb1FbW8vAgQNzN0lSGzplCC1cuJDdd9+d7bffPndTtIXafvvt2X333XnjjTdoaGhg8ODBuZskqRWdbmBCfX099fX1VFRU5G6KtnAVFRV069aN+++/n0WLFuVujqRWtCuEIuLSiEgtXntGwT9HxJKIeCsixrTY728i4s1i+aWbsuFegtNHafyONDQ0sGDBgsytkdSa9vaERgDnAjs0eb0FXACcA5wInAX8IiIqASJiODAV+BdgJHByRHx5UzZeebz88ss8/vjjuZvRbt27d2fFihW5myGpFRsSQo+llKqavBqAC4FrUkqPp5T+ANwDnF7c53zg31NKP0spvQ5cDnxjE7e/S3j88cc56qijaGho2OB9hwwZwt13392uujU1NUQEs2fPbrZ9ypQpnHXWWe0eaThjxgzGjRtHSmmD25tLZ2qrtDX5yIEJEbE78AngvojYC3gN+A7wDLAb8GiT6rOAk4rvhwK3tyj7xcY3eT1tvWLzX6JL/7Txf9wOPvhg3nrrLW666SbOP//81s+TEg0NDes8/9K7d2969uy5Tv36+vp16vbq1avZfwGmT5/ORRddxI9+9KNWn62ZPHkykydPZptttml27BUrVlBZWUlE0NDQQHV1NVdddRXf+Ib/zpDUfu0ZHXcg8CbwbWAOcB5wP3BAsfytJnWrgMbxsNu1UtYnIvqmlKpaniQiJgATGn/uqiPfVq9ezeLFi+nZsyfduv3/jujEiRN58803effdd5vVr6uro2fPnsyfP58RI0bQu3fvUlisXr2a2tpaTjzxxNJNeIDa2lo++9nP8p//+Z/rbcu0adM488wzmTp1Kl/60pdardPQ0MAxxxzDHXfcUdpnxIgRpWHP9957L/vvvz/77rvvx/uFSNqqfWQIpZTupxA6AETEPwBfBg4rbqptUn0V0DhsbW0rZRTLq1o5z3XAdY0/Dxw4sEteP3nuuecYO3Ys22yzDcuWLWPt2rX07t27VP6Tn/yElStX0qNHD3r06MHatWs55ZRTuPXWW1m7dm2p3vLlyxk4cCAVFRWUlZUxadIkzjnnnHa3Y/LkyVx11VX85je/4cgjj1xv3Yhg1apVlJeXc9lll3HjjTeWQuiHP/wh5557LnvuuScppVZ7ZZLUlg0eop0KF9ffA3YBEjCgSfF2wJri+yWtlNGkfKs0cuRIqqqqWLhwIZMmTeKLX/wiVVVVzV6DBg1i6tSprFixgtWrV3Prrbeuc5w777yTUaNGMXjwYE4++eRST6WlhoYGli1bRk1NDTU1NQCsWbOGAQMGcP/99zNq1KhSWU1NDStWrGDlypWl/VNKlJWVMWbMGIYOHcq8efMYP348Q4cOZejQoTz33HNMnDiRoUOH8rOf/axjfmmSSiI276ujfWQIRcQVEXFBk5+3pXApbh7wPIWRb42GA40r0c1qpWw18MFGtrnLGDt2LE888QS1tf+/wzh37lz+/Oc/c/jhh7e5X01NDddeey1/93d/B8Do0aN5//33Wx2x9s4779CvXz/69etH//79ARg2bBjnnXceRxxxBDvssAP9+/cvvfr168ell17a7Fzl5eXMmDGDl156id12240rrriCe++9t3Qp7vvf/z6vvPJKm/ezJKkt7bkn9Bzw84iYB3wIXAosB/4d2B24JCIeArYHzga+V9zvbuDRiLiRwr2kS4BHiqPqBOyzzz7st99+3HPPPXz1q18F4Gc/+xnHHXcc/fr1a3O/a665hgMOOIAjjjgCKFwumzx5Mueffz5PPfVUs0EElZWVzS7jRQSvvPIKlZWVnHnmmfTr149rr722zXNVV1fTdJn1k046iZkzZzJz5kwAPvOZz1BZWfmxPr8kteueUEQMAW4BtqEwGu7QlNLyiLgB+D8Uej89KfR+phT3mxUR11MYRbcaqAZGd8Bn6NT+8R//kbPPPpvjjz+eDz/8kB//+MfMmDGjzfp/+MMfuPHGG5k1a1az7ccffzx33XUXp556Kvfccw9lZWUfee4LLriAww47jPHjx7P33nu3WmfhwoV8/vOf5+qrr+aWW26hrq6Ovn37lh4EXbNmDU8++SSPPfZYlx1MIqnjtOueUErp+ymlnVJKfVJKJ6SU3i5urwGOLr6OAY5IKdU22e8yCpfuTgH2SynN3dQfoLM76qij+NznPsdJJ53E1772Nc444ww+/elPt1r3+eef58tf/jI//elP2WWXXZqV1dfXc9NNN7Fo0SLGjBnDkiVLPvLcQ4YM4fTTT+fEE09k6dKlrdaZM2cOgwYN4pJLLmHu3LkceeSRHHvssbzwwgs8/vjj1NfXc+mllxpAkj6WjZ7AtDhQYeZ6yudQuBynNlx11VUMGTKE+vp6pkyZ0mqdu+66i3PPPZcbbriB1atX079/f3r16kVVVRWnn3463bp1o6KigldffZVTTjmFUaNG8cwzz9CnT591jrVs2TLGjx/PM888wxNPPMExxxzDoYceytSpUxk+fHip3tKlS3nxxRcZMWJEadv111/P6NGjiQgee+wxxo0bxwknnLDJfyeStg6dbgLTrqS2tpbbbruNkSNHcsEFFzBp0iQOO+wwJk6cyDvvvNOs7ogRI7jjjjs466yzOPXUU6murmbJkiUMGTKEW2+9laqqKhYsWMD222/Pgw8+yEMPPbROAH3wQWFMyOGHH85LL73ElClTqKio4JFHHmHYsGEcfPDBnHLKKbz1VuHxrptvvpmRI0eyww47AIXe1uzZsxk2bBjf+973mDt3LhUVFTz99NMsXrzYtZ0kbbBOuZRDWzbF7AUdbdWqVTz88MM8+uij3Hfffey11148+OCDHHTQQUDh3s4VV1zBnnvuyUEHHcTo0aM54YQTGDVqFHvttdc6x1uzZk1p6HWjbt26sccee6xT99VXX6WsrIyJEydy8cUXlx5uraio4Fe/+hWnnnoqv//97/nUpz4FwMqVK5kwYQIPPPAAl19+OR988AEHHnggJ554It///vd56aWXeOCBB/jlL3/J//7v/1JeXs7cuXPXO6hCkpqKLXVOrYEDB6b58+evs72+vp4XXniBoUOHdsolnBsaGjjttNPYeeedOfPMMznggANarbdgwQKmTZvGo48+ys0338wnP/nJTXL+t99+e4NHs9XV1bFo0aL1rsmTUmLp0qWlYeBbgsbvysyZM9lvv/1KowmlzmxzLyCwKSIiIhaklFpdXdIQUpdlCKkr6moh5D0hSVI2hpAkKRtDSJKUjSEkScrGEJIkZWMIZda4Ymqj1157rc2HPp999ll+8IMflH6ura3dpMtWt3XeFStWcNtttzVb4gHg9ttv54033thk51+fl19+udVZwiV1boZQZtOnT+cv//IvWbt2LWvXrmXs2LH827/9W6t1KyoqmDhxIq+//jpQmPvtU5/6FJWVlc1egwcPJiJYvXp1ad+UEtXV1SxevJjXX3+dGTNmMHXqVK655hr+9m//lkMOOYSddtqpNFtCUy+//DLf/va3mw2Jb2ho4Dvf+Q5PPfXUej9fTU0NEcHs2bObbZ8yZQpnnXVWu2dZmDFjBuPGjdukoSspvy41Y0Jnk1Ji0qRJjBkzhh49Cv8rrrzySi6++GKOPvrodWYe2Gefffirv/orpk6dyhVXXMG8efNaPe6SJUvYcccd6dWrV2nbRRddxO23386uu+7Ktttuyyc+8QkGDBjAzjvvzNChQzn22GPZZZdd+MQnPrHO8V599VWOPfbYZsf77//+b1JKfPrTn2bu3MK8tOXl5QwaNKjZvo37NN13+vTpXHTRRfzoRz9q9VmvyZMnM3ny5GZLUtTX17NixQoqKyuJCBoaGqiuruaqq67iG9/4Ruu/YElbvC4VQpv7IS7YuAe5brrpJt59910mTpxY2ta4lPdJJ53EAw880OwPMRT+QO+6664ArF69ml69epWWVUgpsWrVqlLdaPILaWho4IwzzuD6669vd/sWLlzIQQcdVOpR/e53v6N79+4sXryYX/ziF5SXl5cmL121ahU777zzR/aMpk2bxplnnsnUqVP50pe+1GqdhoYGjjnmmNJqsdOmTWPEiBGlJcUbF9Pbd9992/1ZJG2ZvByXyezZs7n44ou54YYb2HbbbUvbI4K77rqLefPmcfTRR/Pee+8BhXncampqGDBgQGnOtwMOOGCdS3FtzQpQW1tLXV0d1dXV631VVVWV7v306NGDpUuXsnLlSlauXMkzzzzDihUreO+997jvvvuYOXMms2fPZvbs2fz0pz+lvLx8vZ958uTJnHXWWfzmN79pM4Ca/h5WrVpFfX09l112WbN7Tz/84Q95/vnnqaura7YqraTOp0v1hDqLhQsXMnbsWL785S9z3HHHNVv5FKB///784Q9/4LjjjmPIkCFcffXVzJ8/nzlz5vDggw/y8MMP87nPfY4333yz1eO3tpZQXV0dt99+O7/85S/X27Y1a9Zw8cUXM2nSpFLYNdWtWze++93vcvzxx5cmOm08fuMlt4aGBlasWNEslNasWcOAAQO4//77GTFiRLNJV+vq6kpLUUChR1dWVsaYMWNYunQp8+bNY/z48aVZwefOncvEiRO56qqrOO+881xWXOrEDKHNLKXECSecwN57702PHj3o3bt3m3XvvPNOnnjiCVavXs2VV14JFJbrbvzjPn36dE4//XQGDx5MQ0MDixYt4sMPP2z1WFVVVfz93/89kyZN2ujPMGHCBJYtW8bJJ5/MnXfeSVlZGbW1taUQeuedd9hzzz0pLy8vXRIcNmxYabBEz549m90Lqq2t5fzzzy+N/KupqaG8vLy0wux+++3Hd7/73dK6Rl/96lf5zne+w1e+8pWN/iyS8jKENrOIYPr06aUBAD/+8Y+ZMGECNTU13HzzzaV6ffr0YZ999uFrX/vaOsdoHMRQXl7OEUccwfTp06murmbPPfds87xLlizhk5/8JPfeey/nnntum/V+8pOfrPPHvW/fvgCloeT77rsvdXV1PP3006U1jmpqakohVFlZ2ax3FxG88sorVFZWcuaZZ9KvXz+uvfbaNttQXV3NjjvuWPr5pJNOYubMmcycWVg78TOf+cwGzwQuactkCGXQcmnuF154gVNPPbUULkuWLGHNmjXrXToBmg88+Civv/46u+22G8uXL+eAAw4o9TKaGj16dKtDoKuqqoDCMhCNS0+UlZVx4YUXMnnyZMaNG8fq1avXGUTRmgsuuIDDDjuM8ePHs/fee7daZ+HChXz+85/n6quv5pZbbqGuro6+ffuWPu+aNWt48skneeyxx1xWXOrkHJiQ2RtvvMFTTz3FF7/4xdK2hQsX0qtXr2a9gaYaGhq47rrraGho4Pe//z2VlZXsv//+bd6knzdvHu+//37pktj6bMjyGKeddhqjRo1i1apVVFdXtysQhgwZwumnn86JJ57I0qVLW60zZ84cBg0axCWXXMLcuXM58sgjOfbYY3nhhRd4/PHHqa+v59JLLzWApC7AnlBGy5Yt4+tf/zonn3wyu+++e2n7/PnzGThwYKuBkVLim9/8Jv3792fQoEF84QtfYPr06axcubI0dLulO+64g+HDh7PTTjttUO+pNU17SjvssAM///nPAfjwww/XGwrLli1j/PjxPPPMMzzxxBMcc8wxHHrooUydOpXhw4eX6i1dupQXX3yxdP8H4Prrr2f06NFEBI899hjjxo0rDQ2X1LnZE8rk2WefZeTIkVRUVDBlyhSgcBnuqaee4tZbb2W//fZbZ59FixaxaNEidtllF+6//36OOuoo/vVf/xUozKbQeNls6dKlpbB58803uf766zn77LOBQi9q5syZDBw4cJ3XzJkzm93LaXzft29f+vbty0EHHURdXd067aqtreWPf/xjsyBt9MEHHwBw+OGH89JLLzFlyhQqKip45JFHGDZsGAcffDCnnHJKaaaGm2++mZEjR7LDDjsAhYdUZ8+ezbBhw/je977H3Llzqaio4Omnn2bx4sXtnnFB0pbJEMrgwQcf5NBDD2XMmDE89NBDpaHJdXV1HHPMMSxfvpx/+Zd/WWe/hoYG/vqv/5pf//rXlJWV0bdvX/bYY49mdb7+9a8zatQovvCFLwCUBgGcddZZQCFYDjnkEObPn7/O65BDDqG6urp0rO7duzN27Fiqqqqoqqpi3rx5peNAoXdz0EEHMXLkSIYOHcopp5yyTptfffVVysrKmDhxIv/zP//DwQcfDBRC81e/+hXTp0+nf//+peHeK1euZMKECTzwwAMMGzaMPfbYg0mTJjFy5EiWLl3KtGnT+NOf/sS3vvUtdtttN/r379/mZT1JWz6X985k8eLFDBgwYJMf99lnn6W2tpYRI0a0+pzPqlWrWLlyZZv3mzZUXV0dZWVl663z9ttvb/Botrq6OhYtWrTewRkpJZYuXUr//v1bLXd5b3VFXW15b+8JZdIRAQQ0u7/Smm222aZdo9ja66MCCPhYw6nLysraNTqwrQCS1Dl4OU6SlI0hJEnKxhCSJGXT6UKocehx09VIpdY0fke21ME3kjrhwIRu3bqxzTbb8OabbzJ48GB69uy50Q9gqmtJKVFbW8vbb79NXV0dDQ0NfkekLVSnCyGAvffem9dee41XX33VPy5qVUqJ5cuXs3jxYtauXdtszSZJW45OGULdu3ensrKSe+65h+rqavr3728YqSSlVLoEt2zZMnr27FlalVXSlqXTPazaVFVVFdOnT+eDDz4whLSOlBK9evXiuOOOM4TUZXS1h1U7dQhB4cn6FStWuMyz1tGjRw+23Xbbj1x2XOpMuloIdcrLcU2VlZWVFoiTJHUunW6ItiSp6zCEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKZoNDKCKmRMRtxffbR8Q9EbE8ImZGxJ5N6kVE/HNELImItyJizCZstySpC9igEIqI0cD4JptuAwYBnwF+DfwmIroXyy4AzgFOBM4CfhERlRvVWklSl9LuEIqI3sDPgdeKPw8GTgDOTym9nlK6oXi8kcVdLgSuSSk9nlL6A3APcPombLskqZPbkJ7QlcArFMIEYCjwYUrp2SZ1ZgEHR8R2wG7Aoy3LPn5TJUldTbtCKCJGAKdRuLzWaDtgXouqVcDAYhnAW62UtXWOCRExv/FVXV3dnqZJkjqxjwyhiOgJ3AJ8O6X0bpOitUBti+qrgIpiGS3KG8talVK6LqU0sPHVp0+f9rRfktSJtacndCkwJ6X0yxbblwADWmzbDlgDfACkFuWNZZIkAdCjHXVOA3aKiKriz70ohNcwYNeI2DWltKBYNhx4MKVUGxHPUxik8E6TsvmbrOWSpE6vPT2hw4D9KQxEGApMAR4AxgD/CUyKiG4R8QXgUOB3xf3uBi6JiO0iYhBwNvDwJm29JKlT+8ieUEqpWe+l2CPqm1KaHxHfAv4deB/oC0xOKb1YrHoD8H8o9H56UhgdN2WTtVyS1OlFSmnjDhDRi0Jv6b0mAdRYFsBngXLgiZRSfXuPO3DgwDR/vlfvJKmpiM17vo2MCAAiYkFKqdXR0e25J7ReKaUa4D/aKEvAzI09hySpa3ICU0lSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNj02pHJE9AYOABanlN7pmCZ1XhGb93wpbd7zSdKm1u6eUESMBv4M3ALMjogfFrf3jIibI6IqIl6KiINb7HdeRCyIiIURMW4Ttl2S1Mm1qycUET2Bu4DTUkq/i4i9KATRT4BzgCOA0cDewLSI2D+ltDwijgeuBU4F/gT8NiJeSSnN6oDPIknqZNrbE+oL/HNK6XcAKaU5wDJgAIUQ+oeU0gsppbuBF4ETivtdCNySUpqeUnoeuL5YX5Kk9oVQSum9lNLNjT9HxPlANVAF9AYebVJ9FtB4SW7oesokSVu5DR2YsD3wMrArcDiwLfBBSmlZk2pVwPDi++2At1qUDWzj2BOACY0/b7/99hvSNElSJ7RBQ7SLYfN54OfAVGAtUNui2iqgovi+ZXnTspbHvi6lNLDx1adPnw1pmiSpE9rg54RSSnNTSuOBOuAooF9EdG9SZTtgTfH9Egr3jVorkyRt5doVQhHxmYi4tcXmOuAdCpfYmt7nGQ7ML76fBYxso0yStJVrb0/odeCvIuLyiBgYERcAOwIPA9OAyyOiPCIOBP6muB3gbuCbEbFrRPQFLmpSJknayrV3dNxy4FhgDIVAOhk4OqX0PnAZhUB6F3gOuC+l9EBx17uB/wDmUOgBdQOu3JQfQJLUeUXaBHO/REQPYBRQk1J6spXyv6Bwb+i/Uko17TnmwIED0/z5nevKndP2SOponfHvTEQsSCm1OjJ6g4ZotyWltBaYsZ7yFzfFeSRJXYuzaEuSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKZpM8JyRJW6O4YjM/OQpA13pK3Z6QJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCmbdodQRBwdEbMjYm1EvB0RY4vbe0bEzRFRFREvRcTBLfY7LyIWRMTCiBi3qT+AJKnz6tGeShFRCfwamAD8Fvg28KuI+CRwJXAEMBrYG5gWEfunlJZHxPHAtcCpwJ+A30bEKymlWZv6g0iSOp/29oT2By5LKf0ipbSEQrBsCxwAnAP8Q0rphZTS3cCLwAnF/S4EbkkpTU8pPQ9cX6wvSVL7Qiil9FBK6cYmm/YDGoByoDfwaJOyWUDjJbmh6ymTJG3lPu7AhH8CfkUhiD5IKS1rUlYFDCy+3w54q42yZiJiQkTMb3xVV1d/zKZJkjqLDQ6hiDgH+DTwf4G1QG2LKquAiuL7luVNy5pJKV2XUhrY+OrTp8+GNk2S1MlsUAhFxF8C/wqcnlJaACwB+kVE9ybVtgPWFN8vAQa0USZJ2sptyBDtXYD7gO+nlB4sbp5H4RJb0/s8w4H5xfezgJFtlEmStnLtCqGI6AU8DLwAXBcRfSKiDxDANODyiCiPiAOBvynWBbgb+GZE7BoRfYGLmpRJkrZy7XpOCDga+Ivia0WT7WcClwG/B94F+gD3ppQeKJbfDXwRmENhEMM7FJ4rkiSpfSGUUrqfQq+nVcVZEkYBNSmlJ5vsl4AzIuIHFO4N/VdKqWbjmixJ6ira2xNar5TSWmDGespf3BTnkSR1LU5gKknKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKZpPMmCDp44k2J8PqGClt3vNJH8WekCQpG0NIkpSNISRJysYQkiRlYwhJkrIxhCRJ2RhCkqRsDCFJUjaGkCQpG0NIkpSNISRJysYQkiRlYwhJkrIxhCRJ2RhCkqRsDCFJUjaGkCQpG0NIkpSNISRJysYQkiRlYwhJkrIxhCRJ2RhCkqRseuRuQEeKK2IznzFt5vNJUudmT0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGy69BBtaUNs/iH94LB+be3sCUmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsml3CEVE74h4JiJGN9nWMyJujoiqiHgpIg5usc95EbEgIhZGxLhN12xJUlfQruW9I2Jb4G5geIuia4AjgNHA3sC0iNg/pbQ8Io4HrgVOBf4E/DYiXkkpzdpUjZckdW7t7QndDjwHvNO4ISLKgXOAf0gpvZBSuht4ETihWOVC4JaU0vSU0vPA9cX6kiQB7Q+hi1NK3wVSk21DgN7Ao022zQIaL8kNXU+ZJEntC6GU0putbN4O+CCltKzJtipgYJPyt9ooW0dETIiI+Y2v6urq9jRNktSJbczouLVAbYttq4CKNsqblq0jpXRdSmlg46tPnz4b0TRJUmewMSG0BOgXEd2bbNsOWNOkfEAbZZIkbVQIzaNwia3pfZ7hwPzi+1nAyDbKJEn6+CGUUqoHpgGXR0R5RBwI/A3wcLHK3cA3I2LXiOgLXNSkTJKkjZ4x4TJgR+BdCkO470spPVAsuxv4D2AOhR5QN+DKjTyfJKkLadfDqo1SSpUtfl5SnCVhFFCTUnqySVkCzoiIH1C4N/RfKaWajW+yJKmr2KAQak1KaS0wYz3lL27sOSRJXZMTmEqSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsjGEJEnZGEKSpGwMIUlSNoaQJCkbQ0iSlI0hJEnKxhCSJGVjCEmSsunwEIqIQRHx+4hYERGPRMROHX1OSVLn0KEhFBHdgOnAamAo8L/ALzrynJKkzqNHBx9/FHAgMDal9G5EXAosiojBKaV3OvjckqQtXEdfjhsKvJJSehcgpVQLvAAc3MHnlSR1Ah3dE9oOeKvFtipgYMuKETEBmNBkU31EvNtxTesI0Qeo3mxni811JnUcvzPaUJ3yO7NjWwUdHUJrgdoW21YBFS0rppSuA67r4PZ0qIiYn1JaJ2Cltvid0Ybqat+Zjr4ctwQY0GLbdsCaDj6vJKkT6OgQmgUMj4hygIgIYBgwv4PPK0nqBDo0hFJKLwELgUuKm84GdgIe68jzZtSpLycqC78z2lBd6jsTKaWOPUHEZ4EHgAB2AC5MKf24Q08qSeoUOjyEACJiO2AkMDelNKfDTyhJ6hQ2SwhJktQaJzD9mCLi9Ih4KSJqivPiPZy7TZK6voiYERFv527HptLRzwl1SRFxGHA78AjwA6AX8BdZGyVJnZAh9PGMKv73wpTSG1lbIkmdmJfjPp7y4n9bzgYhSdoAhlA7RURlRKSISMA/FTfPa9wWEaOb1E0RcVtEdI+Ib0fEixHxxxztVl4RMbK4ntbyiFgSEQ9HxIFNyt8uXuPfJSKmRcSyiFgcET+MiLKcbdfmFxGXN/mb0vJ1Rou620XELcXv1QcRcVdxJHKn4uW49nsfOK34/kvAXwPfpjA1EcBrLep3A+4HDgceAmZvhjZqCxIRYyh8B/4I/F+gN3AuMCsihqeUGr8zfYHHKcww//fA14BvAfMo3HPU1uM3wNwW2/4R2I3mf2PKKNyTXg18FxgLfBVYClzY8c3cdByi/TFExOUUekO7pZTebqU8UZio9WXgmJTSh5u1gcquOEXVXGA5cDTQUCzai0Io3ZhS+lZxlNOngJ+mlM4p7tsPeBf495TSsZu77dpyRMRE4CrgGymlKcVtM4DPUQihsSmlhmKv+c/AkpTSAbna+3HYE+o43YCvGEBbrb2A3YvvF7dS3nQ05RpgYuMPKaWlEbEI6NNxzdOWLiK+AlwJ3NAYQC1cnFJqAEgp1UXEG8DgzdnGTcEQ6ji/TSn9KXcjlE3j+im3Ab9spXx5k/dzU0pLW5Q3oK1Wcbqz24GHge+0UqU6pfRyi22d8jtjCHWcVbkboKwa7xWuTik92rQgIoa2UVciInajcC9xDnBKSqm+lWot/9HSaTk6TuoYb1AYWHBCRGzfuDEidgGeBi7L1TBtuSKiL4WBTAk4NqW0Im+LOp49IakDpJRSRHyTwr9on42In1IYyfQNCs+XXZWzfdpi3QnsC/wIODyar629OKX0H1la1YEMIamDpJQejojPUxhJeRlQR2Ghx9NSSs9lbZy2VI3PkH2rlbLHgS4XQg7RliRl4z0hSVI2hpAkKRtDSJKUjSEkScrGEJIkZWMISZKyMYQkSdkYQpKkbAwhSVI2/w+AU9oPfAw/pwAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 使用gpt2分词器进行分词的效果\n", "f = get_token_stats(tokenizer_gpt2)\n", "f.savefig('gpt2_tokenizer.png', dpi=200)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'É_var_iste_ Gal_ois_ (/_�_�_æ_l_ˈ_w_�_�_�_�_/_;_ French_:_ [_eva_�_�_ist_ _�_�_al_wa_];_ 25_ October_ 18_11_ –_ 31_ May_ 18_32_)_ was_ a_ French_ mathematician_ and_ political_ activist_._ While_ still_ in_ his_ teens_,_ he_ was_ able_ to_ determine_ a_ necessary_ and_ sufficient_ condition_ for_ a_ po_lyn_omial_ to_ be_ sol_vable_ by_ radicals_,_ thereby_ solving_ a_ problem_ that_ had_ been_ open_ for_ 350_ years_._ His_ work_ laid_ the_ foundations_ for_ Gal_ois_ theory_ and_ group_ theory_,_ two_ major_ branches_ of_ abstract_ algebra_._ He_ was_ a_ staunch_ republican_ and_ was_ heavily_ involved_ in_ the_ political_ turmoil_ that_ surrounded_ the_ French_ Revolution_ of_ 1830_._ As_ a_ result_ of_ his_ political_ activism_,_ he_ was_ arrested_ repeatedly_,_ serving_ one_ jail_ sentence_ of_ several_ months_._ For_ reasons_ that_ remain_ obscure_,_ shortly_ after_ his_ release_ from_ prison_ he_ fought_ in_ a_ duel_ and_ died_ of_ the_ wounds_ he_ suffered_.'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 英文分词效果展示\n", "'_'.join([tokenizer_gpt2.decode(i) for i in tokenizer_gpt2.encode(text_en)])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"É_var_iste_ Gal_ois_ (/_�_�_æ_l_ˈ_w_�_�_�_�_/_;_ fr_an_ç_ais_ :_ [_eva_�_�_ist_ _�_�_al_wa_]_ ;_ 25_ oct_ob_re_ 18_11_ -_ 31_ m_ai_ 18_32_)_ _ét_ait_ un_ math_é_matic_ien_ fr_an_ç_ais_ et_ un_ militant_ polit_ique_._ Al_ors_ qu_'_il_ _ét_ait_ enc_ore_ adolescent_,_ il_ par_v_int_ à_ dé_termin_er_ une_ condition_ n_é_cess_aire_ et_ suff_is_ante_ pour_ qu_'_un_ po_lyn_ô_me_ so_it_ r_és_ol_uble_ par_ des_ rad_ic_aux_,_ r_és_ol_vant_ a_ins_i_ un_ prob_l_è_me_ qui_ _ét_ait_ rest_é_ o_u_vert_ p_endant_ 350_ ans_._ Son_ tra_v_ail_ pos_a_ les_ fond_ements_ de_ la_ th_é_orie_ de_ Gal_ois_ et_ de_ la_ th_é_orie_ des_ group_es_,_ de_ux_ branches_ maj_e_ures_ de_ l_'_alg_è_bre_ ab_stra_ite_._ Il_ _ét_ait_ un_ ferv_ent_ ré_public_ain_ et_ fut_ tr_è_s_ impl_iqu_é_ d_ans_ les_ troubles_ polit_iques_ qui_ ent_our_è_rent_ la_ Ré_v_olution_ fr_an_ça_ise_ de_ 1830_._ En_ ra_ison_ de_ son_ activism_e_ polit_ique_,_ il_ fut_ arr_ê_t_é_ à_ plus_ie_urs_ re_prises_,_ pur_g_é_ une_ pe_ine_ de_ plus_ie_urs_ mo_is_ de_ prison_._ Pour_ des_ ra_isons_ rest_é_es_ obsc_ures_,_ pe_u_ de_ tem_ps_ apr_è_s_ sa_ lib_é_ration_ de_ prison_,_ il_ se_ batt_it_ en_ duel_ et_ dé_cé_da_ des_ bless_ures_ qu_'_il_ sub_it_.\"" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 法语分词效果展示\n", "'_'.join([tokenizer_gpt2.decode(i) for i in tokenizer_gpt2.encode(text_fr)])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'�_�_�_�_�_�_�_�_�_�_�_�_·_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_É_var_iste_ Gal_ois_�_�_�_18_11_�_�_10_�_�_25_�_�_—_18_32_�_�_5_�_�_31_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_ [_eva_�_�_ist_ _�_�_al_wa_]_�_�_�_是_一_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_。_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_��_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_的_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_一_�_�_�_��_�_�_�_�_�_�_的_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_350_�_�_。_�_�_的_�_�_作_�_�_�_�_�_�_G_alo_is_�_�_�_�_�_�_�_�_�_�_�_�_�_�_的_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_是_�_�_�_�_�_�_代_�_�_的_�_�_�_�_�_�_�_�_�_。_�_�_是_一_�_�_�_�_�_�_�_的_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_18_30_�_�_�_�_�_�_大_�_�_�_�_�_�_�_�_�_的_�_�_�_�_�_�_�_�_�_�_�_。_�_�_�_�_�_�_的_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_中_一_�_�_�_�_�_�_�_�_�_�_�_。_�_�_�_�_�_�_�_�_不_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_不_�_�_�_�_�_�_�_�_�_�_�_一_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_�_。'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 中文分词效果展示\n", "'_'.join([tokenizer_gpt2.decode(i) for i in tokenizer_gpt2.encode(text_zh)])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "# 使用中文语料训练分词器\n", "raw_data = load_dataset('BelleGroup/train_0.5M_CN')\n", "\n", "def get_training_corpus():\n", " # 为了减少运算时间,只选择较少的训练数据\n", " data = raw_data['train'].select(range(10000))\n", " for idx in range(0, len(data), 1000):\n", " samples = data[idx : idx + 1000]\n", " yield samples.get('instruction', []) + samples.get('output', [])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# 为了减少运算时间,只将词汇表大小设置为1000,正常应该在5万左右\n", "tokenizer_zh = tokenizer_gpt2.train_new_from_iterator(get_training_corpus(), 1000)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fr': 142, 'en': 132, 'zh': 278}\n", "{'fr': 728, 'en': 557, 'zh': 315}\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 展示新分词器的效果\n", "f = get_token_stats(tokenizer_zh)\n", "f.savefig('zh_tokenizer.png', dpi=200)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'�_�_�_�_�_里_�_�_特_·_�_�_�_�_�_�_�_(_法_语_:_�_�_v_ar_is_t_e_ _G_al_o_is_,_1_8_1_1_年_1_0_�_�_2_5_日_�_�_1_8_3_2_年_5_�_�_3_1_日_,_法_语_发_音_:_ _[_e_v_a_�_�_is_t_ _�_�_al_w_a_]_)_是一_位_法_国_数_学_家_和_�_�_�_�_活_动_家_。_�_�_管_还_在_�_�_�_�_�_�_�_时_,_他_就_能够_确_定_多_项_式_能够_通过_根_式_求_解_的_�_�_分_�_�_要_�_�_件_,_从_而_解_决_了_一个_�_�_而_未_决_的_问题_,_该_问题_已_经_�_�_在_了_3_5_0_年_。_他_的_工作_�_�_定_了_G_al_o_is_理_论_和_�_�_论_的_基_�_�_,_这_两_个_是_�_�_�_�_代_数_的_重要_分_�_�_。_他_是一_位_�_�_定_的_�_�_和_�_�_,_深_度_�_�_与_了_1_8_3_0_年_法_国_大_�_�_�_�_期_间_的�_�_�_�_�_动_�_�_。_由_于_他_的�_�_�_�_�_活_动_,_他_多_次_被_�_�_�_�_,_其_中_一_次_入_�_�_数_�_�_。_由_于_原_因_不_明_,_他_在_�_�_�_�_�_�_放_后_不_�_�_,_�_�_与_了_一_场_决_�_�_并_因_受_�_�_而_去_�_�_。'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 中文分词效果展示\n", "'_'.join([tokenizer_zh.decode(i) for i in tokenizer_zh.encode(text_zh)])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }