{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 安装第三方库\n", "!pip install transformers" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import AutoTokenizer, GPT2LMHeadModel\n", "\n", "\n", "torch.manual_seed(12046)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('gpt2')\n", "model = GPT2LMHeadModel.from_pretrained('gpt2')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input_ids': tensor([[2061, 318, 262, 3139, 286, 2807, 30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 使用分词器对文本进行分词\n", "question = 'What is the capital of China?'\n", "ids = tokenizer(question, return_tensors='pt')\n", "ids" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "tensor([ 2061, 318, 262, 3139, 286, 2807, 30, 198, 198, 464,\n", " 3139, 318, 2807, 13, 632, 338, 262, 4387, 3773, 287,\n", " 262, 995, 11, 290, 340, 468, 257, 3265, 286, 517,\n", " 621, 352, 13, 20, 2997, 661, 13, 2807, 318, 530,\n", " 286, 262, 14162, 3957, 16533, 319, 3668, 11, 351, 281,\n", " 5079, 12396, 286, 720, 16, 13, 17, 12989, 13, 383,\n", " 1499, 318, 1363, 284, 257, 1271, 286, 995, 12, 4871,\n", " 11155, 11, 1390, 262, 2059, 286, 3442, 11, 14727, 11,\n", " 262, 3999, 8581, 286, 5483, 13473, 11, 2807, 338, 2351,\n", " 5800, 5693, 290, 262, 21865, 8581, 329, 5800, 13, 198])\n", "What is the capital of China?\n", "\n", "The capital is China. It's the largest economy in the world, and it has a population of more than 1.5 billion people. China is one of the fastest growing economies on Earth, with an annual GDP of $1.2 trillion. The country is home to a number of world-class universities, including the University of California, Berkeley, the Chinese Academy of Social Sciences, China's National Science Foundation and the Shanghai Academy for Science.\n", "\n" ] } ], "source": [ "# 由于GPT-2的模型效果较差,通过增大num_beams和no_repeat_ngram_size来优化生成的文本。\n", "res = model.generate(**ids, max_length=100, early_stopping=True,\n", " num_beams=3, no_repeat_ngram_size=2)\n", "print(res[0])\n", "print(tokenizer.decode(res[0], skip_special_tokens=True))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# 问答示例模版\n", "template = '''\n", "Q: What is the capital of the United Kingdom?\n", "A: London.\n", "\n", "Q: What is the capital of France?\n", "A: Paris.\n", "\n", "Q: %s\n", "A:\n", "'''" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Q: What is the capital of the United Kingdom?\n", "A: London.\n", "\n", "Q: What is the capital of France?\n", "A: Paris.\n", "\n", "Q: What is the capital of China?\n", "A:\n", "\n" ] } ], "source": [ "print(template % question)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "ids2 = tokenizer(template % question , return_tensors='pt')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Q: What is the capital of the United Kingdom?\n", "A: London.\n", "\n", "Q: What is the capital of France?\n", "A: Paris.\n", "\n", "Q: What is the capital of China?\n", "A:\n", "... Beijing.\n" ] } ], "source": [ "# 通过问答示例来获得想要的结果\n", "res2 = model.generate(**ids2, max_length=100, early_stopping=True,\n", " num_beams=3, no_repeat_ngram_size=2)\n", "print(tokenizer.decode(res2[0], skip_special_tokens=True))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "中国的首都在哪里?。\n", "\n", "非常和属经啊建城的自己,那么做喜址器,没有陛下解长支提队的现�\n" ] } ], "source": [ "# 中文的效果较差\n", "question_zh = '中国的首都在哪里?'\n", "ids_zh = tokenizer(question_zh, return_tensors='pt')\n", "res_zh = model.generate(**ids_zh, max_length=100, early_stopping=True,\n", " num_beams=3, no_repeat_ngram_size=2)\n", "print(tokenizer.decode(res_zh[0], skip_special_tokens=True))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Q: What is the capital of the United Kingdom?\n", "A: London.\n", "\n", "Q: What is the capital of France?\n", "A: Paris.\n", "\n", "Q: 中国的首都在哪里?\n", "A:\n", "The capital city of China, Shanghai, is located in the middle of a vast expanse of land that stretches from the north to the south. It is also known as the \"Great Wall of\n" ] } ], "source": [ "# 即使使用问答示例,也无法获得想要的结果\n", "ids_zh2 = tokenizer(template % question_zh , return_tensors='pt')\n", "res_zh2 = model.generate(**ids_zh2, max_length=100, early_stopping=True,\n", " num_beams=3, no_repeat_ngram_size=2)\n", "print(tokenizer.decode(res_zh2[0], skip_special_tokens=True))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }