Skip to content

Commit

Permalink
deploy: cff43a0
Browse files Browse the repository at this point in the history
  • Loading branch information
harrykeightley committed Sep 8, 2023
1 parent 21e8887 commit a441dbb
Show file tree
Hide file tree
Showing 6 changed files with 258 additions and 34 deletions.
26 changes: 19 additions & 7 deletions datasets/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -150,24 +150,36 @@ <h2 id="parameters">Parameters</h2>
processor: The processor to apply over the dataset
&#34;&#34;&#34;

def prepare_dataset(batch: Dict) -&gt; Dict[str, List]:
logger.debug(f&#34;Dataset pre prep: {dataset}&#34;)
logger.debug(f&#34;Dataset[train] pre prep: {dataset[&#39;train&#39;][&#39;transcript&#39;]}&#34;)
logger.debug(f&#34;Tokenizer vocab: {processor.tokenizer.vocab}&#34;) # type: ignore

def _prepare_dataset(batch: Dict) -&gt; Dict[str, List]:
# Also from https://huggingface.co/blog/fine-tune-xlsr-wav2vec2
audio = batch[&#34;audio&#34;]

batch[&#34;input_values&#34;] = processor(
audio[&#34;array&#34;], sampling_rate=audio[&#34;sampling_rate&#34;]
).input_values[0]
batch[&#34;input_length&#34;] = len(batch[&#34;input_values&#34;])

with processor.as_target_processor():
batch[&#34;labels&#34;] = processor(batch[&#34;transcript&#34;]).input_ids
batch[&#34;labels&#34;] = processor(text=batch[&#34;transcript&#34;]).input_ids

return batch

return dataset.map(
prepare_dataset,
remove_columns=dataset.column_names[&#34;train&#34;],
column_names = [dataset.column_names[key] for key in dataset.column_names.keys()]
# flatten
columns_to_remove = list(chain.from_iterable(column_names))

dataset = dataset.map(
_prepare_dataset,
remove_columns=columns_to_remove,
num_proc=PROCESSOR_COUNT,
)</code></pre>
)

logger.debug(f&#34;Dataset post prep: {dataset}&#34;)
logger.debug(f&#34;Training labels: {dataset[&#39;train&#39;][&#39;labels&#39;]}&#34;)
return dataset</code></pre>
</details>
</dd>
<dt id="elpis.datasets.process_batch"><code class="name flex">
Expand Down
54 changes: 40 additions & 14 deletions datasets/processing.html
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ <h1 class="title">Module <code>elpis.datasets.processing</code></h1>
<span>Expand source code</span>
</summary>
<pre><code class="python">import os
from itertools import chain
from pathlib import Path
from typing import Any, Dict, List, Optional

from datasets import Audio, DatasetDict, load_dataset
from loguru import logger
from transformers import Wav2Vec2Processor

PROCESSOR_COUNT = 4
Expand Down Expand Up @@ -88,24 +90,36 @@ <h1 class="title">Module <code>elpis.datasets.processing</code></h1>
processor: The processor to apply over the dataset
&#34;&#34;&#34;

def prepare_dataset(batch: Dict) -&gt; Dict[str, List]:
logger.debug(f&#34;Dataset pre prep: {dataset}&#34;)
logger.debug(f&#34;Dataset[train] pre prep: {dataset[&#39;train&#39;][&#39;transcript&#39;]}&#34;)
logger.debug(f&#34;Tokenizer vocab: {processor.tokenizer.vocab}&#34;) # type: ignore

def _prepare_dataset(batch: Dict) -&gt; Dict[str, List]:
# Also from https://huggingface.co/blog/fine-tune-xlsr-wav2vec2
audio = batch[&#34;audio&#34;]

batch[&#34;input_values&#34;] = processor(
audio[&#34;array&#34;], sampling_rate=audio[&#34;sampling_rate&#34;]
).input_values[0]
batch[&#34;input_length&#34;] = len(batch[&#34;input_values&#34;])

with processor.as_target_processor():
batch[&#34;labels&#34;] = processor(batch[&#34;transcript&#34;]).input_ids
batch[&#34;labels&#34;] = processor(text=batch[&#34;transcript&#34;]).input_ids

return batch

return dataset.map(
prepare_dataset,
remove_columns=dataset.column_names[&#34;train&#34;],
column_names = [dataset.column_names[key] for key in dataset.column_names.keys()]
# flatten
columns_to_remove = list(chain.from_iterable(column_names))

dataset = dataset.map(
_prepare_dataset,
remove_columns=columns_to_remove,
num_proc=PROCESSOR_COUNT,
)</code></pre>
)

logger.debug(f&#34;Dataset post prep: {dataset}&#34;)
logger.debug(f&#34;Training labels: {dataset[&#39;train&#39;][&#39;labels&#39;]}&#34;)
return dataset</code></pre>
</details>
</section>
<section>
Expand Down Expand Up @@ -195,24 +209,36 @@ <h2 id="parameters">Parameters</h2>
processor: The processor to apply over the dataset
&#34;&#34;&#34;

def prepare_dataset(batch: Dict) -&gt; Dict[str, List]:
logger.debug(f&#34;Dataset pre prep: {dataset}&#34;)
logger.debug(f&#34;Dataset[train] pre prep: {dataset[&#39;train&#39;][&#39;transcript&#39;]}&#34;)
logger.debug(f&#34;Tokenizer vocab: {processor.tokenizer.vocab}&#34;) # type: ignore

def _prepare_dataset(batch: Dict) -&gt; Dict[str, List]:
# Also from https://huggingface.co/blog/fine-tune-xlsr-wav2vec2
audio = batch[&#34;audio&#34;]

batch[&#34;input_values&#34;] = processor(
audio[&#34;array&#34;], sampling_rate=audio[&#34;sampling_rate&#34;]
).input_values[0]
batch[&#34;input_length&#34;] = len(batch[&#34;input_values&#34;])

with processor.as_target_processor():
batch[&#34;labels&#34;] = processor(batch[&#34;transcript&#34;]).input_ids
batch[&#34;labels&#34;] = processor(text=batch[&#34;transcript&#34;]).input_ids

return batch

return dataset.map(
prepare_dataset,
remove_columns=dataset.column_names[&#34;train&#34;],
column_names = [dataset.column_names[key] for key in dataset.column_names.keys()]
# flatten
columns_to_remove = list(chain.from_iterable(column_names))

dataset = dataset.map(
_prepare_dataset,
remove_columns=columns_to_remove,
num_proc=PROCESSOR_COUNT,
)</code></pre>
)

logger.debug(f&#34;Dataset post prep: {dataset}&#34;)
logger.debug(f&#34;Training labels: {dataset[&#39;train&#39;][&#39;labels&#39;]}&#34;)
return dataset</code></pre>
</details>
</dd>
</dl>
Expand Down
22 changes: 18 additions & 4 deletions trainer/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ <h2 class="section-title" id="header-submodules">Sub-modules</h2>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="elpis.trainer.metrics" href="metrics.html">elpis.trainer.metrics</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="elpis.trainer.trainer" href="trainer.html">elpis.trainer.trainer</a></code></dt>
<dd>
<div class="desc"></div>
Expand All @@ -62,7 +66,7 @@ <h2 class="section-title" id="header-functions">Functions</h2>
<span>def <span class="ident">train</span></span>(<span>job: <a title="elpis.trainer.job.TrainingJob" href="job.html#elpis.trainer.job.TrainingJob">TrainingJob</a>, output_dir: pathlib.Path, dataset_dir: pathlib.Path, cache_dir: Optional[pathlib.Path] = None, log_file: Optional[pathlib.Path] = None) ‑> pathlib.Path</span>
</code></dt>
<dd>
<div class="desc"><p>Trains a model for use in transcription.</p>
<div class="desc"><p>Fine-tunes a model for use in transcription.</p>
<h2 id="parameters">Parameters</h2>
<p>job: Info about the training job, e.g. training options.
output_dir: Where to save the trained model.
Expand All @@ -82,7 +86,7 @@ <h2 id="returns">Returns</h2>
cache_dir: Optional[Path] = None,
log_file: Optional[Path] = None,
) -&gt; Path:
&#34;&#34;&#34;Trains a model for use in transcription.
&#34;&#34;&#34;Fine-tunes a model for use in transcription.

Parameters:
job: Info about the training job, e.g. training options.
Expand Down Expand Up @@ -125,6 +129,7 @@ <h2 id="returns">Returns</h2>
eval_dataset=dataset[&#34;test&#34;], # type: ignore
tokenizer=processor.feature_extractor,
data_collator=data_collator,
compute_metrics=create_metrics(job.metrics, processor),
)

logger.info(f&#34;Begin training model...&#34;)
Expand All @@ -138,9 +143,9 @@ <h2 id="returns">Returns</h2>
logger.info(f&#34;Model written to disk.&#34;)

metrics = trainer.evaluate()
logger.info(&#34;==== Metrics ====&#34;)
trainer.log_metrics(&#34;eval&#34;, metrics)
trainer.save_metrics(&#34;eval&#34;, metrics)
logger.info(&#34;==== Metrics ====&#34;)
logger.info(metrics)

return output_dir</code></pre>
Expand All @@ -153,7 +158,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="elpis.trainer.TrainingJob"><code class="flex name class">
<span>class <span class="ident">TrainingJob</span></span>
<span>(</span><span>model_name: str, dataset_name: str, options: <a title="elpis.trainer.TrainingOptions" href="#elpis.trainer.TrainingOptions">TrainingOptions</a>, status: <a title="elpis.trainer.TrainingStatus" href="#elpis.trainer.TrainingStatus">TrainingStatus</a> = TrainingStatus.WAITING, base_model: str = 'facebook/wav2vec2-base-960h', sampling_rate: int = 16000)</span>
<span>(</span><span>model_name: str, dataset_name: str, options: <a title="elpis.trainer.TrainingOptions" href="#elpis.trainer.TrainingOptions">TrainingOptions</a>, status: <a title="elpis.trainer.TrainingStatus" href="#elpis.trainer.TrainingStatus">TrainingStatus</a> = TrainingStatus.WAITING, base_model: str = 'facebook/wav2vec2-base-960h', sampling_rate: int = 16000, metrics: Tuple[str, ...] = ('wer', 'cer'))</span>
</code></dt>
<dd>
<div class="desc"><p>A class representing a training job for a model</p></div>
Expand All @@ -171,6 +176,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
status: TrainingStatus = TrainingStatus.WAITING
base_model: str = BASE_MODEL
sampling_rate: int = SAMPLING_RATE
metrics: Tuple[str, ...] = METRICS

def to_training_args(self, output_dir: Path, **kwargs) -&gt; TrainingArguments:
return TrainingArguments(
Expand Down Expand Up @@ -205,6 +211,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
status=TrainingStatus(data.get(&#34;status&#34;, TrainingStatus.WAITING)),
base_model=data.get(&#34;base_model&#34;, BASE_MODEL),
sampling_rate=data.get(&#34;sampling_rate&#34;, SAMPLING_RATE),
metrics=data.get(&#34;metrics&#34;, METRICS),
)

def to_dict(self) -&gt; Dict[str, Any]:
Expand All @@ -222,6 +229,10 @@ <h3>Class variables</h3>
<dd>
<div class="desc"></div>
</dd>
<dt id="elpis.trainer.TrainingJob.metrics"><code class="name">var <span class="ident">metrics</span> : Tuple[str, ...]</code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt id="elpis.trainer.TrainingJob.model_name"><code class="name">var <span class="ident">model_name</span> : str</code></dt>
<dd>
<div class="desc"></div>
Expand Down Expand Up @@ -259,6 +270,7 @@ <h3>Static methods</h3>
status=TrainingStatus(data.get(&#34;status&#34;, TrainingStatus.WAITING)),
base_model=data.get(&#34;base_model&#34;, BASE_MODEL),
sampling_rate=data.get(&#34;sampling_rate&#34;, SAMPLING_RATE),
metrics=data.get(&#34;metrics&#34;, METRICS),
)</code></pre>
</details>
</dd>
Expand Down Expand Up @@ -477,6 +489,7 @@ <h1>Index</h1>
<ul>
<li><code><a title="elpis.trainer.data_collator" href="data_collator.html">elpis.trainer.data_collator</a></code></li>
<li><code><a title="elpis.trainer.job" href="job.html">elpis.trainer.job</a></code></li>
<li><code><a title="elpis.trainer.metrics" href="metrics.html">elpis.trainer.metrics</a></code></li>
<li><code><a title="elpis.trainer.trainer" href="trainer.html">elpis.trainer.trainer</a></code></li>
<li><code><a title="elpis.trainer.utils" href="utils.html">elpis.trainer.utils</a></code></li>
</ul>
Expand All @@ -494,6 +507,7 @@ <h4><code><a title="elpis.trainer.TrainingJob" href="#elpis.trainer.TrainingJob"
<li><code><a title="elpis.trainer.TrainingJob.base_model" href="#elpis.trainer.TrainingJob.base_model">base_model</a></code></li>
<li><code><a title="elpis.trainer.TrainingJob.dataset_name" href="#elpis.trainer.TrainingJob.dataset_name">dataset_name</a></code></li>
<li><code><a title="elpis.trainer.TrainingJob.from_dict" href="#elpis.trainer.TrainingJob.from_dict">from_dict</a></code></li>
<li><code><a title="elpis.trainer.TrainingJob.metrics" href="#elpis.trainer.TrainingJob.metrics">metrics</a></code></li>
<li><code><a title="elpis.trainer.TrainingJob.model_name" href="#elpis.trainer.TrainingJob.model_name">model_name</a></code></li>
<li><code><a title="elpis.trainer.TrainingJob.options" href="#elpis.trainer.TrainingJob.options">options</a></code></li>
<li><code><a title="elpis.trainer.TrainingJob.sampling_rate" href="#elpis.trainer.TrainingJob.sampling_rate">sampling_rate</a></code></li>
Expand Down
15 changes: 13 additions & 2 deletions trainer/job.html
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,14 @@ <h1 class="title">Module <code>elpis.trainer.job</code></h1>
from dataclasses import dataclass, fields
from enum import Enum
from pathlib import Path
from typing import Any, Dict
from typing import Any, Dict, Tuple

import torch
from transformers import TrainingArguments

BASE_MODEL = &#34;facebook/wav2vec2-base-960h&#34;
SAMPLING_RATE = 16_000
METRICS = (&#34;wer&#34;, &#34;cer&#34;)


class TrainingStatus(Enum):
Expand Down Expand Up @@ -80,6 +81,7 @@ <h1 class="title">Module <code>elpis.trainer.job</code></h1>
status: TrainingStatus = TrainingStatus.WAITING
base_model: str = BASE_MODEL
sampling_rate: int = SAMPLING_RATE
metrics: Tuple[str, ...] = METRICS

def to_training_args(self, output_dir: Path, **kwargs) -&gt; TrainingArguments:
return TrainingArguments(
Expand Down Expand Up @@ -114,6 +116,7 @@ <h1 class="title">Module <code>elpis.trainer.job</code></h1>
status=TrainingStatus(data.get(&#34;status&#34;, TrainingStatus.WAITING)),
base_model=data.get(&#34;base_model&#34;, BASE_MODEL),
sampling_rate=data.get(&#34;sampling_rate&#34;, SAMPLING_RATE),
metrics=data.get(&#34;metrics&#34;, METRICS),
)

def to_dict(self) -&gt; Dict[str, Any]:
Expand All @@ -133,7 +136,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="elpis.trainer.job.TrainingJob"><code class="flex name class">
<span>class <span class="ident">TrainingJob</span></span>
<span>(</span><span>model_name: str, dataset_name: str, options: <a title="elpis.trainer.job.TrainingOptions" href="#elpis.trainer.job.TrainingOptions">TrainingOptions</a>, status: <a title="elpis.trainer.job.TrainingStatus" href="#elpis.trainer.job.TrainingStatus">TrainingStatus</a> = TrainingStatus.WAITING, base_model: str = 'facebook/wav2vec2-base-960h', sampling_rate: int = 16000)</span>
<span>(</span><span>model_name: str, dataset_name: str, options: <a title="elpis.trainer.job.TrainingOptions" href="#elpis.trainer.job.TrainingOptions">TrainingOptions</a>, status: <a title="elpis.trainer.job.TrainingStatus" href="#elpis.trainer.job.TrainingStatus">TrainingStatus</a> = TrainingStatus.WAITING, base_model: str = 'facebook/wav2vec2-base-960h', sampling_rate: int = 16000, metrics: Tuple[str, ...] = ('wer', 'cer'))</span>
</code></dt>
<dd>
<div class="desc"><p>A class representing a training job for a model</p></div>
Expand All @@ -151,6 +154,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
status: TrainingStatus = TrainingStatus.WAITING
base_model: str = BASE_MODEL
sampling_rate: int = SAMPLING_RATE
metrics: Tuple[str, ...] = METRICS

def to_training_args(self, output_dir: Path, **kwargs) -&gt; TrainingArguments:
return TrainingArguments(
Expand Down Expand Up @@ -185,6 +189,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
status=TrainingStatus(data.get(&#34;status&#34;, TrainingStatus.WAITING)),
base_model=data.get(&#34;base_model&#34;, BASE_MODEL),
sampling_rate=data.get(&#34;sampling_rate&#34;, SAMPLING_RATE),
metrics=data.get(&#34;metrics&#34;, METRICS),
)

def to_dict(self) -&gt; Dict[str, Any]:
Expand All @@ -202,6 +207,10 @@ <h3>Class variables</h3>
<dd>
<div class="desc"></div>
</dd>
<dt id="elpis.trainer.job.TrainingJob.metrics"><code class="name">var <span class="ident">metrics</span> : Tuple[str, ...]</code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt id="elpis.trainer.job.TrainingJob.model_name"><code class="name">var <span class="ident">model_name</span> : str</code></dt>
<dd>
<div class="desc"></div>
Expand Down Expand Up @@ -239,6 +248,7 @@ <h3>Static methods</h3>
status=TrainingStatus(data.get(&#34;status&#34;, TrainingStatus.WAITING)),
base_model=data.get(&#34;base_model&#34;, BASE_MODEL),
sampling_rate=data.get(&#34;sampling_rate&#34;, SAMPLING_RATE),
metrics=data.get(&#34;metrics&#34;, METRICS),
)</code></pre>
</details>
</dd>
Expand Down Expand Up @@ -461,6 +471,7 @@ <h4><code><a title="elpis.trainer.job.TrainingJob" href="#elpis.trainer.job.Trai
<li><code><a title="elpis.trainer.job.TrainingJob.base_model" href="#elpis.trainer.job.TrainingJob.base_model">base_model</a></code></li>
<li><code><a title="elpis.trainer.job.TrainingJob.dataset_name" href="#elpis.trainer.job.TrainingJob.dataset_name">dataset_name</a></code></li>
<li><code><a title="elpis.trainer.job.TrainingJob.from_dict" href="#elpis.trainer.job.TrainingJob.from_dict">from_dict</a></code></li>
<li><code><a title="elpis.trainer.job.TrainingJob.metrics" href="#elpis.trainer.job.TrainingJob.metrics">metrics</a></code></li>
<li><code><a title="elpis.trainer.job.TrainingJob.model_name" href="#elpis.trainer.job.TrainingJob.model_name">model_name</a></code></li>
<li><code><a title="elpis.trainer.job.TrainingJob.options" href="#elpis.trainer.job.TrainingJob.options">options</a></code></li>
<li><code><a title="elpis.trainer.job.TrainingJob.sampling_rate" href="#elpis.trainer.job.TrainingJob.sampling_rate">sampling_rate</a></code></li>
Expand Down
Loading

0 comments on commit a441dbb

Please sign in to comment.