.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
@d2l.add_to_class(MTFraEng) #@save
def _tokenize(self, text, max_examples=None):
    """Split preprocessed text into source and target token lists.

    Each line of ``text`` is expected to hold one tab-separated
    "english<TAB>french" pair; lines without exactly two fields are
    skipped.  Every sequence gets a trailing '<eos>' end-of-sequence
    token.  (The '<eos>' literal was stripped by the HTML export — it
    parses as a tag — which is why the shown output had a third token;
    restored here.)

    Args:
        text: Preprocessed corpus, one example pair per '\n'-separated line.
        max_examples: Optional cap on the number of lines to read;
            None/0 means read everything.
            NOTE(review): `i > max_examples` admits max_examples + 1
            lines (indices 0..max_examples); kept as-is to match the
            published book code.

    Returns:
        A (src, tgt) pair of lists of token lists.
    """
    src, tgt = [], []
    for i, line in enumerate(text.split('\n')):
        if max_examples and i > max_examples: break
        parts = line.split('\t')
        if len(parts) == 2:
            # Skip empty tokens
            src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
            tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
    return src, tgt
# Demo: tokenize the corpus and peek at the first six pairs.
src, tgt = data._tokenize(text)
src[:6], tgt[:6]
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
([['go', '.', '&lt;eos&gt;'],
['hi', '.', '&lt;eos&gt;'],
['run', '!', '&lt;eos&gt;'],
['run', '!', '&lt;eos&gt;'],
['who', '?', '&lt;eos&gt;'],
['wow', '!', '&lt;eos&gt;']],
[['va', '!', '&lt;eos&gt;'],
['salut', '!', '&lt;eos&gt;'],
['cours', '!', '&lt;eos&gt;'],
['courez', '!', '&lt;eos&gt;'],
['qui', '?', '&lt;eos&gt;'],
['ça', 'alors', '!', '&lt;eos&gt;']])
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
@d2l.add_to_class(MTFraEng) #@save
def _tokenize(self, text, max_examples=None):
    """Split preprocessed text into source and target token lists.

    Each line of ``text`` is expected to hold one tab-separated
    "english<TAB>french" pair; lines without exactly two fields are
    skipped.  Every sequence gets a trailing '<eos>' end-of-sequence
    token.  (The '<eos>' literal was stripped by the HTML export — it
    parses as a tag — which is why the shown output had a third token;
    restored here.)

    Args:
        text: Preprocessed corpus, one example pair per '\n'-separated line.
        max_examples: Optional cap on the number of lines to read;
            None/0 means read everything.
            NOTE(review): `i > max_examples` admits max_examples + 1
            lines (indices 0..max_examples); kept as-is to match the
            published book code.

    Returns:
        A (src, tgt) pair of lists of token lists.
    """
    src, tgt = [], []
    for i, line in enumerate(text.split('\n')):
        if max_examples and i > max_examples: break
        parts = line.split('\t')
        if len(parts) == 2:
            # Skip empty tokens
            src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
            tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
    return src, tgt
# Demo: tokenize the corpus and peek at the first six pairs.
src, tgt = data._tokenize(text)
src[:6], tgt[:6]
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
([['go', '.', '&lt;eos&gt;'],
['hi', '.', '&lt;eos&gt;'],
['run', '!', '&lt;eos&gt;'],
['run', '!', '&lt;eos&gt;'],
['who', '?', '&lt;eos&gt;'],
['wow', '!', '&lt;eos&gt;']],
[['va', '!', '&lt;eos&gt;'],
['salut', '!', '&lt;eos&gt;'],
['cours', '!', '&lt;eos&gt;'],
['courez', '!', '&lt;eos&gt;'],
['qui', '?', '&lt;eos&gt;'],
['ça', 'alors', '!', '&lt;eos&gt;']])
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
@d2l.add_to_class(MTFraEng) #@save
def _tokenize(self, text, max_examples=None):
    """Split preprocessed text into source and target token lists.

    Each line of ``text`` is expected to hold one tab-separated
    "english<TAB>french" pair; lines without exactly two fields are
    skipped.  Every sequence gets a trailing '<eos>' end-of-sequence
    token.  (The '<eos>' literal was stripped by the HTML export — it
    parses as a tag — which is why the shown output had a third token;
    restored here.)

    Args:
        text: Preprocessed corpus, one example pair per '\n'-separated line.
        max_examples: Optional cap on the number of lines to read;
            None/0 means read everything.
            NOTE(review): `i > max_examples` admits max_examples + 1
            lines (indices 0..max_examples); kept as-is to match the
            published book code.

    Returns:
        A (src, tgt) pair of lists of token lists.
    """
    src, tgt = [], []
    for i, line in enumerate(text.split('\n')):
        if max_examples and i > max_examples: break
        parts = line.split('\t')
        if len(parts) == 2:
            # Skip empty tokens
            src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
            tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
    return src, tgt
# Demo: tokenize the corpus and peek at the first six pairs.
src, tgt = data._tokenize(text)
src[:6], tgt[:6]
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
([['go', '.', '&lt;eos&gt;'],
['hi', '.', '&lt;eos&gt;'],
['run', '!', '&lt;eos&gt;'],
['run', '!', '&lt;eos&gt;'],
['who', '?', '&lt;eos&gt;'],
['wow', '!', '&lt;eos&gt;']],
[['va', '!', '&lt;eos&gt;'],
['salut', '!', '&lt;eos&gt;'],
['cours', '!', '&lt;eos&gt;'],
['courez', '!', '&lt;eos&gt;'],
['qui', '?', '&lt;eos&gt;'],
['ça', 'alors', '!', '&lt;eos&gt;']])
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
@d2l.add_to_class(MTFraEng) #@save
def _tokenize(self, text, max_examples=None):
    """Split preprocessed text into source and target token lists.

    Each line of ``text`` is expected to hold one tab-separated
    "english<TAB>french" pair; lines without exactly two fields are
    skipped.  Every sequence gets a trailing '<eos>' end-of-sequence
    token.  (The '<eos>' literal was stripped by the HTML export — it
    parses as a tag — which is why the shown output had a third token;
    restored here.)

    Args:
        text: Preprocessed corpus, one example pair per '\n'-separated line.
        max_examples: Optional cap on the number of lines to read;
            None/0 means read everything.
            NOTE(review): `i > max_examples` admits max_examples + 1
            lines (indices 0..max_examples); kept as-is to match the
            published book code.

    Returns:
        A (src, tgt) pair of lists of token lists.
    """
    src, tgt = [], []
    for i, line in enumerate(text.split('\n')):
        if max_examples and i > max_examples: break
        parts = line.split('\t')
        if len(parts) == 2:
            # Skip empty tokens
            src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
            tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
    return src, tgt
# Demo: tokenize the corpus and peek at the first six pairs.
src, tgt = data._tokenize(text)
src[:6], tgt[:6]
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
([['go', '.', '&lt;eos&gt;'],
['hi', '.', '&lt;eos&gt;'],
['run', '!', '&lt;eos&gt;'],
['run', '!', '&lt;eos&gt;'],
['who', '?', '&lt;eos&gt;'],
['wow', '!', '&lt;eos&gt;']],
[['va', '!', '&lt;eos&gt;'],
['salut', '!', '&lt;eos&gt;'],
['cours', '!', '&lt;eos&gt;'],
['courez', '!', '&lt;eos&gt;'],
['qui', '?', '&lt;eos&gt;'],
['ça', 'alors', '!', '&lt;eos&gt;']])
.. raw:: html
.. raw:: html
Let’s plot the histogram of the number of tokens per text sequence. In
this simple English–French dataset, most of the text sequences have
fewer than 20 tokens.
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
#@save
def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist):
    """Overlay histograms of sequence lengths for two lists of sequences."""
    d2l.set_figsize()
    # One hist() call for both series so matplotlib picks shared bins.
    length_pairs = [[len(seq) for seq in xlist],
                    [len(seq) for seq in ylist]]
    _, _, patches = d2l.plt.hist(length_pairs)
    d2l.plt.xlabel(xlabel)
    d2l.plt.ylabel(ylabel)
    # Hatch the second series' bars to keep the two distributions apart.
    for bar in patches[1].patches:
        bar.set_hatch('/')
    d2l.plt.legend(legend)
# Plot token-count distributions of the source and target sequences.
show_list_len_pair_hist(['source', 'target'], '# tokens per sequence',
                        'count', src, tgt);
.. figure:: output_machine-translation-and-dataset_887557_63_0.svg
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
#@save
def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist):
    """Overlay histograms of sequence lengths for two lists of sequences."""
    d2l.set_figsize()
    # One hist() call for both series so matplotlib picks shared bins.
    length_pairs = [[len(seq) for seq in xlist],
                    [len(seq) for seq in ylist]]
    _, _, patches = d2l.plt.hist(length_pairs)
    d2l.plt.xlabel(xlabel)
    d2l.plt.ylabel(ylabel)
    # Hatch the second series' bars to keep the two distributions apart.
    for bar in patches[1].patches:
        bar.set_hatch('/')
    d2l.plt.legend(legend)
# Plot token-count distributions of the source and target sequences.
show_list_len_pair_hist(['source', 'target'], '# tokens per sequence',
                        'count', src, tgt);
.. figure:: output_machine-translation-and-dataset_887557_66_0.svg
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
#@save
def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist):
    """Overlay histograms of sequence lengths for two lists of sequences."""
    d2l.set_figsize()
    # One hist() call for both series so matplotlib picks shared bins.
    length_pairs = [[len(seq) for seq in xlist],
                    [len(seq) for seq in ylist]]
    _, _, patches = d2l.plt.hist(length_pairs)
    d2l.plt.xlabel(xlabel)
    d2l.plt.ylabel(ylabel)
    # Hatch the second series' bars to keep the two distributions apart.
    for bar in patches[1].patches:
        bar.set_hatch('/')
    d2l.plt.legend(legend)
# Plot token-count distributions of the source and target sequences.
show_list_len_pair_hist(['source', 'target'], '# tokens per sequence',
                        'count', src, tgt);
.. figure:: output_machine-translation-and-dataset_887557_69_0.svg
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
#@save
def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist):
    """Overlay histograms of sequence lengths for two lists of sequences."""
    d2l.set_figsize()
    # One hist() call for both series so matplotlib picks shared bins.
    length_pairs = [[len(seq) for seq in xlist],
                    [len(seq) for seq in ylist]]
    _, _, patches = d2l.plt.hist(length_pairs)
    d2l.plt.xlabel(xlabel)
    d2l.plt.ylabel(ylabel)
    # Hatch the second series' bars to keep the two distributions apart.
    for bar in patches[1].patches:
        bar.set_hatch('/')
    d2l.plt.legend(legend)
# Plot token-count distributions of the source and target sequences.
show_list_len_pair_hist(['source', 'target'], '# tokens per sequence',
                        'count', src, tgt);
.. figure:: output_machine-translation-and-dataset_887557_72_0.svg
.. raw:: html
.. raw:: html
.. _subsec_loading-seq-fixed-len:
Loading Sequences of Fixed Length
---------------------------------
Recall that in language modeling each example sequence, either a segment
of one sentence or a span over multiple sentences, had a fixed length.
This was specified by the ``num_steps`` (number of time steps or tokens)
argument from :numref:`sec_language-model`. In machine translation,
each example is a pair of source and target text sequences, where the
two text sequences may have different lengths.
For computational efficiency, we can still process a minibatch of text
sequences at one time by *truncation* and *padding*. Suppose that every
sequence in the same minibatch should have the same length
``num_steps``. If a text sequence has fewer than ``num_steps`` tokens,
we will keep appending the special “