Comparing changes

This is a direct comparison between two commits made in this repository.

base repository: medkit-lib/medkit
base: 03ac41ff487e8611e237b988c9a5d70b680224b2
head repository: medkit-lib/medkit
compare: 3e4d0143edffbd8e03d2cede3db92b680d9ce647
46 changes: 46 additions & 0 deletions docs/_templates/autosummary/module.rst
@@ -0,0 +1,46 @@
:orphan:

{{ fullname | escape | underline}}

{% if not modules %}
.. automodule:: {{ fullname }}
   :members:
   :inherited-members: dict
   :autosummary:
   :autosummary-members:
   :autosummary-inherited-members: dict
{% endif %}

{% block members %}
{% if members and modules %}
APIs
----

To access these APIs, you can use an import like this:

.. code-block:: python

   from {{ fullname }} import <api_to_import>

.. automodule:: {{ fullname }}
   :members: {% for item in members %} {{ item }}, {%- endfor %}
   :inherited-members: dict
   :autosummary:
   :autosummary-members:
   :autosummary-inherited-members: dict
   :autosummary-no-nesting:
{% endif %}
{% endblock %}

{% block modules %}
{% if modules %}
Subpackages / Submodules
------------------------
.. autosummary::
   :toctree:
   :recursive:
{% for item in modules %}
   {{ item }}
{%- endfor %}
{% endif %}
{% endblock %}
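
Note: for Sphinx's autosummary extension to pick up a template like this, the docs configuration has to expose the `_templates` directory and enable stub generation; the `:autosummary:` options on `automodule` additionally rely on the `autodocsumm` extension. A minimal `conf.py` sketch (these exact option values are assumptions for illustration, not part of this diff):

```python
# docs/conf.py -- minimal sketch, values assumed for illustration
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "autodocsumm",  # provides the :autosummary: options used in the template
]
templates_path = ["_templates"]   # directory holding autosummary/module.rst
autosummary_generate = True       # generate module pages from the template
```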
167 changes: 167 additions & 0 deletions docs/examples/audio_dataset_metrics.md
@@ -0,0 +1,167 @@
---
jupytext:
formats: md:myst
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.14.4
kernelspec:
display_name: Python 3 (ipykernel)
language: python
name: python3
---

# Computing metrics on an audio dataset

This demo shows how to compute diarization and transcription metrics on an audio
dataset such as [simsamu](https://huggingface.co/datasets/medkit/simsamu).

Download the dataset from the HuggingFace hub:

```{code-cell} ipython3
:tags: [skip-execution]
import huggingface_hub as hf_hub

simsamu_dir = hf_hub.snapshot_download("medkit/simsamu", repo_type="dataset")
```

Load the `.m4a` audio files into audio documents, as well as reference documents
carrying diarization and transcription annotations from the corresponding `.rttm`
and `.srt` files:

```{code-cell} ipython3
:tags: [skip-execution]
from pathlib import Path
from medkit.core.audio import AudioDocument
from medkit.io.rttm import RTTMInputConverter
from medkit.io.srt import SRTInputConverter
# init input converters for .rttm and .srt files
rttm_converter = RTTMInputConverter(turn_label="speech", speaker_label="speaker")
srt_converter = SRTInputConverter(turn_segment_label="speech", transcription_attr_label="transcription")
docs = []
ref_docs_diar = []
ref_docs_transcript = []
for rec_dir in sorted(Path(simsamu_dir).glob("*"))[:4]:
    # iterate only on subdirs
    if not rec_dir.is_dir():
        continue
    # locate audio, .rttm and .srt files
    m4a_file = next(rec_dir.glob("*.m4a"))
    rttm_file = next(rec_dir.glob("*.rttm"))
    srt_file = next(rec_dir.glob("*.srt"))
    # convert m4a to wav with ffmpeg
    wav_file = m4a_file.with_suffix(".wav")
    if not wav_file.exists():
        !ffmpeg -i {m4a_file} -acodec pcm_s16le -ac 1 -ar 16000 {wav_file}
    # load empty audio doc
    doc = AudioDocument.from_file(wav_file)
    docs.append(doc)
    # load reference audio doc with diarization annotations
    ref_doc_diar = rttm_converter.load_doc(rttm_file=rttm_file, audio_file=wav_file)
    ref_docs_diar.append(ref_doc_diar)
    # load reference audio doc with transcription annotations
    ref_doc_transcript = srt_converter.load_doc(srt_file=srt_file, audio_file=wav_file)
    ref_docs_transcript.append(ref_doc_transcript)
```

Initialize the diarization operation with the [simsamu-diarization pipeline](https://huggingface.co/medkit/simsamu-diarization):

```{code-cell} ipython3
:tags: [skip-execution]
import torch
from medkit.audio.segmentation.pa_speaker_detector import PASpeakerDetector
device = 0 if torch.cuda.is_available() else -1
speaker_detector = PASpeakerDetector(
    model="medkit/simsamu-diarization",
    output_label="speech",
    min_nb_speakers=1,
    max_nb_speakers=2,
    device=device,
    segmentation_batch_size=10,
    embedding_batch_size=10,
)
```

Initialize the transcription operation with the [simsamu-transcription model](https://huggingface.co/medkit/simsamu-transcription):

```{code-cell} ipython3
:tags: [skip-execution]
from medkit.audio.transcription.sb_transcriber import SBTranscriber
transcriber = SBTranscriber(
    model="medkit/simsamu-transcription",
    needs_decoder=False,
    output_label="transcription",
    device=device,
    batch_size=10,
)
```

Diarize and transcribe all documents:

```{code-cell} ipython3
:tags: [skip-execution]
from tqdm import tqdm
# list of list of segments, per document
# (this structure is needed to compute the metrics)
all_speech_segs = []
for doc in tqdm(docs):
    speech_segs = speaker_detector.run([doc.raw_segment])
    transcriber.run(speech_segs)
    all_speech_segs.append(speech_segs)
```

Compute the DER (Diarization Error Rate):

```{code-cell} ipython3
:tags: [skip-execution]
from medkit.audio.metrics.diarization import DiarizationEvaluator
diarization_evaluator = DiarizationEvaluator(
    turn_label="speech",
    speaker_label="speaker",
    collar=0.5,
)
results = diarization_evaluator.compute(ref_docs_diar, all_speech_segs)
print(f"der={results.der:.2%}")
```

```
der=13.45%
```
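
For reference, the DER aggregates false alarms, missed detections, and speaker confusions over the total duration of reference speech (the `collar` parameter excludes a short window around each reference turn boundary from scoring):

```{math}
\mathrm{DER} = \frac{\text{false alarm} + \text{missed detection} + \text{speaker confusion}}{\text{total reference speech duration}}
```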

Compute the WER (Word Error Rate) and CER (Character Error Rate):

```{code-cell} ipython3
:tags: [skip-execution]
from medkit.audio.metrics.transcription import TranscriptionEvaluator
transcription_evaluator = TranscriptionEvaluator(
    speech_label="speech",
    transcription_label="transcription",
)
results = transcription_evaluator.compute(ref_docs_transcript, all_speech_segs)
print(f"wer={results.wer:.2%}, cer={results.cer:.2%}")
```

```
wer=20.77%, cer=15.13%
```
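
As a reminder, the WER is the word-level edit distance between reference and hypothesis transcriptions normalized by the reference length, with {math}`S`, {math}`D`, and {math}`I` the numbers of substituted, deleted, and inserted words and {math}`N` the number of words in the reference; the CER is the same ratio computed over characters:

```{math}
\mathrm{WER} = \frac{S + D + I}{N}
```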

Note that running the transcription operation on the reference speech turns,
rather than on those returned by the diarization operation, yields lower WER and
CER values (around 15% and 9%, respectively).
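
A minimal sketch of that variant (not part of the original example, and assuming the reference diarization documents expose their turns under the `speech` label configured in the RTTM converter above):

```{code-cell} ipython3
:tags: [skip-execution]
# transcribe the *reference* speech turns instead of the diarized ones,
# then evaluate them with the same transcription evaluator
all_ref_speech_segs = []
for ref_doc in tqdm(ref_docs_diar):
    ref_speech_segs = ref_doc.anns.get(label="speech")
    transcriber.run(ref_speech_segs)
    all_ref_speech_segs.append(ref_speech_segs)

results = transcription_evaluator.compute(ref_docs_transcript, all_ref_speech_segs)
print(f"wer={results.wer:.2%}, cer={results.cer:.2%}")
```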
179 changes: 179 additions & 0 deletions docs/examples/audio_transcription.md
@@ -0,0 +1,179 @@
---
jupytext:
formats: md:myst
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.14.4
kernelspec:
display_name: Python 3 (ipykernel)
language: python
name: python3
---

# Audio transcription

This demo shows how to transcribe an audio document and then perform text
operations on it.

## Init audio document

Instantiate an {class}`~.core.audio.AudioDocument` with a
{class}`~.core.audio.FileAudioBuffer`:

```{code-cell} ipython3
from pathlib import Path
import IPython.display
from medkit.core.audio import AudioDocument, FileAudioBuffer
audio_file = Path("input/voice.ogg")
audio_doc = AudioDocument(audio=FileAudioBuffer(audio_file))
IPython.display.Audio(data=audio_doc.audio.read(), rate=audio_doc.audio.sample_rate)
```

## Voice detection

Prepare a pipeline to perform voice detection on audio documents, using a
{class}`~.audio.preprocessing.Downmixer` chained with a
{class}`~.audio.segmentation.webrtc_voice_detector.WebRTCVoiceDetector` (you can
also use other segmentation operations, such as
{class}`~.audio.segmentation.pa_speaker_detector.PASpeakerDetector`, as sketched after the pipeline below):

```{code-cell} ipython3
from medkit.core import Pipeline, PipelineStep, DocPipeline
from medkit.audio.preprocessing import Downmixer
from medkit.audio.segmentation.webrtc_voice_detector import WebRTCVoiceDetector
# init operations
downmixer = Downmixer(output_label="mono")
voice_detector = WebRTCVoiceDetector(output_label="voice")
# put them in a pipeline
audio_pipeline = Pipeline(
    steps=[
        PipelineStep(
            downmixer,
            input_keys=["full_audio"],
            output_keys=["full_mono_audio"],
        ),
        PipelineStep(
            voice_detector,
            input_keys=["full_mono_audio"],
            output_keys=["voice_segs"],
        ),
    ],
    input_keys=["full_audio"],
    output_keys=["voice_segs"],
)
# wrap pipeline in doc-level pipeline
audio_doc_pipeline = DocPipeline(audio_pipeline)
```
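
For instance, swapping the voice detector for a speaker diarization operation could look like this (a minimal sketch; the model name and speaker counts are assumptions borrowed from the simsamu examples, not something this demo requires):

```{code-cell} ipython3
:tags: [skip-execution]
from medkit.audio.segmentation.pa_speaker_detector import PASpeakerDetector

# alternative second step for the pipeline above (sketch)
speaker_detector = PASpeakerDetector(
    model="medkit/simsamu-diarization",  # assumed model
    output_label="voice",
    min_nb_speakers=1,
    max_nb_speakers=2,
)
```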

Run voice detection on the audio document:

```{code-cell} ipython3
audio_doc_pipeline.run([audio_doc])
for seg in audio_doc.anns.get(label="voice"):
    print(f"label={seg.label}, span={seg.span}")
```

## Transcription

Prepare a {class}`~.audio.transcription.DocTranscriber` that will convert audio
documents into text documents, using
{class}`~.audio.transcription.hf_transcriber.HFTranscriber` as the underlying
operation that creates text segments from audio segments (you can also use other
transcription operations, such as
{class}`~.audio.transcription.sb_transcriber.SBTranscriber`, as sketched after the code below):

```{code-cell} ipython3
:tags: [skip-execution]
from medkit.audio.transcription import DocTranscriber
from medkit.audio.transcription.hf_transcriber import HFTranscriber
transcriber = HFTranscriber(
    model="openai/whisper-small",
    language="english",
    add_trailing_dot=False,
    capitalize=False,
)
doc_transcriber = DocTranscriber(
    input_label="voice",
    output_label="transcription",
    transcription_operation=transcriber,
)
```
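
Alternatively, an {class}`~.audio.transcription.sb_transcriber.SBTranscriber` could be plugged into the same {class}`~.audio.transcription.DocTranscriber` (a minimal sketch; the model name is an assumption borrowed from the simsamu examples):

```{code-cell} ipython3
:tags: [skip-execution]
from medkit.audio.transcription.sb_transcriber import SBTranscriber

# alternative transcription operation for the DocTranscriber above (sketch)
transcriber = SBTranscriber(
    model="medkit/simsamu-transcription",  # assumed model
    needs_decoder=False,
    output_label="transcription",
)
doc_transcriber = DocTranscriber(
    input_label="voice",
    output_label="transcription",
    transcription_operation=transcriber,
)
```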

Transcribe the audio document:

```{code-cell} ipython3
:tags: [skip-execution]
transcribed_doc = doc_transcriber.run([audio_doc])[0]
print(f"fulltext={transcribed_doc.text!r}", end="\n\n")
for seg in transcribed_doc.anns.get(label="transcription"):
    print(f"label={seg.label}, text={seg.text!r}")
```

```
fulltext=' I have headaches.\n I also have high blood pressure.'
label=transcription, text=' I have headaches.'
label=transcription, text=' I also have high blood pressure.'
```

## Entity matching on text

Run text entity matching on the transcribed document:

```{code-cell} ipython3
:tags: [skip-execution]
from medkit.core.text import TextDocument
from medkit.text.ner import RegexpMatcher, RegexpMatcherRule
rules = [
    RegexpMatcherRule(label="problem", regexp=r"\bheadaches?\b"),
    RegexpMatcherRule(label="problem", regexp=r"\bhigh\s+blood\s+pressure\b"),
]
matcher = RegexpMatcher(rules)
text_pipeline = Pipeline(
    steps=[PipelineStep(matcher, input_keys=["full_text"], output_keys=["entities"])],
    input_keys=["full_text"],
    output_keys=["entities"],
)
text_doc_pipeline = DocPipeline(
    text_pipeline,
    labels_by_input_key={"full_text": [TextDocument.RAW_LABEL]},
)
text_doc_pipeline.run([transcribed_doc])
```

Locate the matched entities in the original audio:

```{code-cell} ipython3
:tags: [skip-execution]
entities = transcribed_doc.anns.get_entities()
for entity in entities:
    print(f"label={entity.label}, text={entity.text!r}")
    audio_spans = transcribed_doc.get_containing_audio_spans(entity.spans)
    print(f"audio_spans={audio_spans}", end="\n\n")
    audio = audio_doc.audio.trim_duration(audio_spans[0].start, audio_spans[0].end)
    IPython.display.display(IPython.display.Audio(data=audio.read(), rate=audio.sample_rate))
```

```{code-cell} ipython3
:tags: [remove-input]
# hardcoded display of audio spans to work around
# the fact that the cells above are not executed
print("label=problem, text='headaches'")
entity_1_audio = audio_doc.audio.trim_duration(0.99, 2.73)
IPython.display.display(IPython.display.Audio(data=entity_1_audio.read(), rate=entity_1_audio.sample_rate))
print("label=problem, text='high blood pressure'")
entity_2_audio = audio_doc.audio.trim_duration(6.0, 8.73)
IPython.display.display(IPython.display.Audio(data=entity_2_audio.read(), rate=entity_2_audio.sample_rate))
```