{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# One-hot encoding text\n",
    "\n",
    "This notebook loads a Jane Austen text file and builds one-hot tensors for a single line of it: first character by character, then word by word."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import torch\n",
    "torch.set_printoptions(edgeitems=2, threshold=50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the text and pick one line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the character listing at the end of this notebook shows that\n",
    "# `text` contains '\\ufeff' (a byte-order mark). encoding='utf-8-sig' would drop\n",
    "# it, but may change the stored outputs below - confirm before switching.\n",
    "with open('../data/p1ch4/jane-austen/1342-0.txt', encoding='utf8') as f:\n",
    "    text = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lines = text.split('\\n')\n",
    "line = lines[200]\n",
    "line"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## One-hot encoding characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([70, 128])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Normalize the line once so that the tensor allocated here and the encoding\n",
    "# loop in the next cell always agree on the number of characters.\n",
    "line_chars = line.lower().strip()\n",
    "# One row per character, one column per code point below 128.\n",
    "letter_t = torch.zeros(len(line_chars), 128) # <1>\n",
    "letter_t.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, letter in enumerate(line_chars):\n",
    "    # Characters whose code point does not fit in the 128 columns\n",
    "    # (e.g. the curly quotes) all share column 0.\n",
    "    letter_index = ord(letter) if ord(letter) < 128 else 0  # <1>\n",
    "    letter_t[i][letter_index] = 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## One-hot encoding words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',\n",
       " ['impossible',\n",
       " 'mr',\n",
       " 'bennet',\n",
       " 'impossible',\n",
       " 'when',\n",
       " 'i',\n",
       " 'am',\n",
       " 'not',\n",
       " 'acquainted',\n",
       " 'with',\n",
       " 'him'])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def clean_words(input_str):\n",
    "    \"\"\"Lowercase `input_str`, split it on whitespace and strip punctuation from each word.\n",
    "\n",
    "    Punctuation is removed only from the two ends of each word, so a token made\n",
    "    up entirely of punctuation characters becomes an empty string in the result.\n",
    "    \"\"\"\n",
    "    punctuation = '.,;:\"!?”“_-'\n",
    "    word_list = input_str.lower().replace('\\n',' ').split()\n",
    "    word_list = [word.strip(punctuation) for word in word_list]\n",
    "    return word_list\n",
    "\n",
    "words_in_line = clean_words(line)\n",
    "line, words_in_line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7261, 3394)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vocabulary: every distinct cleaned word of the whole text, sorted, mapped to its position.\n",
    "word_list = sorted(set(clean_words(text)))\n",
    "word2index_dict = {word: i for (i, word) in enumerate(word_list)}\n",
    "\n",
    "len(word2index_dict), word2index_dict['impossible']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 0 3394 impossible\n",
      " 1 4305 mr\n",
      " 2 813 bennet\n",
      " 3 3394 impossible\n",
      " 4 7078 when\n",
      " 5 3315 i\n",
      " 6 415 am\n",
      " 7 4436 not\n",
      " 8 239 acquainted\n",
      " 9 7148 with\n",
      "10 3215 him\n",
      "torch.Size([11, 7261])\n"
     ]
    }
   ],
   "source": [
    "# One row per word of the line, one column per vocabulary entry.\n",
    "word_t = torch.zeros(len(words_in_line), len(word2index_dict))\n",
    "for i, word in enumerate(words_in_line):\n",
    "    word_index = word2index_dict[word]\n",
    "    word_t[i][word_index] = 1\n",
    "    print('{:2} {:4} {}'.format(i, word_index, word))\n",
    "\n",
    "print(word_t.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([11, 1, 7261])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Insert a singleton dimension at index 1. Because word_t is rebound,\n",
    "# re-running this cell on its own adds one more dimension each time.\n",
    "word_t = word_t.unsqueeze(1)\n",
    "word_t.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Characters present in the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('\\n', 10),\n",
       " (' ', 32),\n",
       " ('!', 33),\n",
       " ('#', 35),\n",
       " ('$', 36),\n",
       " ('%', 37),\n",
       " (\"'\", 39),\n",
       " ('(', 40),\n",
       " (')', 41),\n",
       " ('*', 42),\n",
       " (',', 44),\n",
       " ('-', 45),\n",
       " ('.', 46),\n",
       " ('/', 47),\n",
       " ('0', 48),\n",
       " ('1', 49),\n",
       " ('2', 50),\n",
       " ('3', 51),\n",
       " ('4', 52),\n",
       " ('5', 53),\n",
       " ('6', 54),\n",
       " ('7', 55),\n",
       " ('8', 56),\n",
       " ('9', 57),\n",
       " (':', 58),\n",
       " (';', 59),\n",
       " ('?', 63),\n",
       " ('@', 64),\n",
       " ('A', 65),\n",
       " ('B', 66),\n",
       " ('C', 67),\n",
       " ('D', 68),\n",
       " ('E', 69),\n",
       " ('F', 70),\n",
       " ('G', 71),\n",
       " ('H', 72),\n",
       " ('I', 73),\n",
       " ('J', 74),\n",
       " ('K', 75),\n",
       " ('L', 76),\n",
       " ('M', 77),\n",
       " ('N', 78),\n",
       " ('O', 79),\n",
       " ('P', 80),\n",
       " ('Q', 81),\n",
       " ('R', 82),\n",
       " ('S', 83),\n",
       " ('T', 84),\n",
       " ('U', 85),\n",
       " ('V', 86),\n",
       " ('W', 87),\n",
       " ('X', 88),\n",
       " ('Y', 89),\n",
       " ('Z', 90),\n",
       " ('[', 91),\n",
       " (']', 93),\n",
       " ('_', 95),\n",
       " ('a', 97),\n",
       " ('b', 98),\n",
       " ('c', 99),\n",
       " ('d', 100),\n",
       " ('e', 101),\n",
       " ('f', 102),\n",
       " ('g', 103),\n",
       " ('h', 104),\n",
       " ('i', 105),\n",
       " ('j', 106),\n",
       " ('k', 107),\n",
       " ('l', 108),\n",
       " ('m', 109),\n",
       " ('n', 110),\n",
       " ('o', 111),\n",
       " ('p', 112),\n",
       " ('q', 113),\n",
       " ('r', 114),\n",
       " ('s', 115),\n",
       " ('t', 116),\n",
       " ('u', 117),\n",
       " ('v', 118),\n",
       " ('w', 119),\n",
       " ('x', 120),\n",
       " ('y', 121),\n",
       " ('z', 122),\n",
       " ('“', 8220),\n",
       " ('”', 8221),\n",
       " ('\\ufeff', 65279)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Every distinct character of the text paired with its code point.\n",
    "[(c, ord(c)) for c in sorted(set(text))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "108"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ord('l')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}