Skip to content

Commit

Permalink
punctuation for microsoft
Browse files Browse the repository at this point in the history
  • Loading branch information
xquanluu authored and davehorton committed Dec 18, 2023
1 parent bcb4bf4 commit 1fa05b5
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 6 deletions.
3 changes: 2 additions & 1 deletion lib/tasks/gather.js
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,8 @@ class TaskGather extends SttTask {
return;
}

evt = this.normalizeTranscription(evt, this.vendor, 1, this.language, this.shortUtterance);
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language,
this.shortUtterance, this.data.recognizer.punctuation);
if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;
Expand Down
3 changes: 2 additions & 1 deletion lib/tasks/transcribe.js
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,8 @@ class TaskTranscribe extends SttTask {
}
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - before normalization');

evt = this.normalizeTranscription(evt, this.vendor, channel, this.language);
evt = this.normalizeTranscription(evt, this.vendor, channel, this.language, undefined,
this.data.recognizer.punctuation);
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription');
if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening');
Expand Down
9 changes: 5 additions & 4 deletions lib/utils/transcription-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -338,14 +338,15 @@ const normalizeNuance = (evt, channel, language) => {
};
};

const normalizeMicrosoft = (evt, channel, language) => {
const normalizeMicrosoft = (evt, channel, language, punctuation) => {
const copy = JSON.parse(JSON.stringify(evt));
const nbest = evt.NBest;
const language_code = evt.PrimaryLanguage?.Language || language;
const alternatives = nbest ? nbest.map((n) => {
return {
confidence: n.Confidence,
transcript: n.Display
// remove all punctuation unless the punctuation option is enabled
transcript: punctuation ? n.Display : n.Display.replace(/\p{P}/gu, '')
};
}) :
[
Expand Down Expand Up @@ -400,14 +401,14 @@ const normalizeAssemblyAi = (evt, channel, language) => {
};

module.exports = (logger) => {
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance) => {
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance, punctuation) => {

//logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
switch (vendor) {
case 'deepgram':
return normalizeDeepgram(evt, channel, language, shortUtterance);
case 'microsoft':
return normalizeMicrosoft(evt, channel, language);
return normalizeMicrosoft(evt, channel, language, punctuation);
case 'google':
return normalizeGoogle(evt, channel, language);
case 'aws':
Expand Down

0 comments on commit 1fa05b5

Please sign in to comment.