/**
* AudioClip encapsulates an audio recording.
*
* @author Alain Pitiot and Sotiri Bakagiannis
* @version 2022.2.3
* @copyright (c) 2021 Open Science Tools Ltd. (https://opensciencetools.org)
* @license Distributed under the terms of the MIT License
*/
import { PsychoJS } from "../core/PsychoJS.js";
import { ExperimentHandler } from "../data/ExperimentHandler.js";
import { PsychObject } from "../util/PsychObject.js";
import * as util from "../util/Util.js";
/**
* <p>AudioClip encapsulates an audio recording.</p>
*
* @extends PsychObject
*/
export class AudioClip extends PsychObject
{
/**
* @memberOf module:sound
* @param {Object} options
* @param {module:core.PsychoJS} options.psychoJS - the PsychoJS instance
* @param {String} [options.name= 'audioclip'] - the name used when logging messages
* @param {string} options.format the format for the audio file
* @param {number} options.sampleRateHz - the sampling rate
* @param {Blob} options.data - the audio data, in the given format, at the given sampling rate
* @param {boolean} [options.autoLog= false] - whether or not to log
*/
constructor({ psychoJS, name, sampleRateHz, format, data, autoLog } = {})
{
super(psychoJS);
this._addAttribute("name", name, "audioclip");
this._addAttribute("format", format);
this._addAttribute("sampleRateHz", sampleRateHz);
this._addAttribute("data", data);
this._addAttribute("autoLog", false, autoLog);
this._addAttribute("status", AudioClip.Status.CREATED);
// add a volume attribute, for playback:
this._addAttribute("volume", 1.0);
if (this._autoLog)
{
this._psychoJS.experimentLogger.exp(`Created ${this.name} = ${this.toString()}`);
}
// decode the blob into an audio buffer:
this._decodeAudio();
}
/**
* Set the volume of the playback.
*
* @param {number} volume - the volume of the playback (must be between 0.0 and 1.0)
*/
setVolume(volume)
{
this._volume = volume;
}
/**
* Start playing the audio clip.
*
* @public
*/
async startPlayback()
{
this._psychoJS.logger.debug("request to play the audio clip");
// wait for the decoding to complete:
await this._decodeAudio();
// note: we need to prepare the audio graph anew each time since, for instance, an
// AudioBufferSourceNode can only be played once
// ref: https://developer.mozilla.org/en-US/docs/Web/API/AudioBufferSourceNode
// create a source node from the in-memory audio data in _audioBuffer:
this._source = this._audioContext.createBufferSource();
this._source.buffer = this._audioBuffer;
// create a gain node, so we can control the volume:
this._gainNode = this._audioContext.createGain();
// connect the nodes:
this._source.connect(this._gainNode);
this._gainNode.connect(this._audioContext.destination);
// set the volume:
this._gainNode.gain.value = this._volume;
// start the playback:
this._source.start();
}
/**
* Stop playing the audio clip.
*
* @param {number} [fadeDuration = 17] - how long the fading out should last, in ms
*/
async stopPlayback(fadeDuration = 17)
{
// TODO deal with fade duration
// stop the playback:
this._source.stop();
}
/**
* Get the duration of the audio clip, in seconds.
*
* @returns {Promise<number>} the duration of the audio clip
*/
async getDuration()
{
// wait for the decoding to complete:
await this._decodeAudio();
return this._audioBuffer.duration;
}
/**
* Upload the audio clip to the pavlovia server.
*
* @public
*/
upload()
{
this._psychoJS.logger.debug("request to upload the audio clip to pavlovia.org");
// add a format-dependent audio extension to the name:
const filename = this._name + util.extensionFromMimeType(this._format);
// if the audio recording cannot be uploaded, e.g. the experiment is running locally, or
// if it is piloting mode, then we offer the audio clip as a file for download:
if (
this._psychoJS.getEnvironment() !== ExperimentHandler.Environment.SERVER
|| this._psychoJS.config.experiment.status !== "RUNNING"
|| this._psychoJS._serverMsg.has("__pilotToken")
)
{
return this.download(filename);
}
// upload the data:
return this._psychoJS.serverManager.uploadAudioVideo({
mediaBlob: this._data,
tag: filename
});
}
/**
* Offer the audio clip to the participant as a sound file to download.
*/
download(filename = "audio.webm")
{
const anchor = document.createElement("a");
anchor.href = window.URL.createObjectURL(this._data);
anchor.download = filename;
document.body.appendChild(anchor);
anchor.click();
document.body.removeChild(anchor);
}
/**
* Transcribe the audio clip.
*
* @param {Object} options
* @param {Symbol} options.engine - the speech-to-text engine
* @param {String} options.languageCode - the BCP-47 language code for the recognition,
* e.g. 'en-GB'
* @return {Promise} a promise resolving to the transcript and associated
* transcription confidence
*/
async transcribe({ engine, languageCode } = {})
{
const response = {
origin: "AudioClip.transcribe",
context: `when transcribing audio clip: ${this._name}`,
};
this._psychoJS.logger.debug(response);
// get the secret key from the experiment configuration:
const fullEngineName = `sound.AudioClip.Engine.${Symbol.keyFor(engine)}`;
let transcriptionKey;
for (const key of this._psychoJS.config.experiment.keys)
{
if (key.name === fullEngineName)
{
transcriptionKey = key.value;
}
}
if (typeof transcriptionKey === "undefined")
{
throw {
...response,
error: `missing key for engine: ${fullEngineName}`,
};
}
// wait for the decoding to complete:
await this._decodeAudio();
// dispatch on engine:
if (engine === AudioClip.Engine.GOOGLE)
{
return this._GoogleTranscribe(transcriptionKey, languageCode);
}
else
{
throw {
...response,
error: `unsupported speech-to-text engine: ${engine}`,
};
}
}
/**
* Transcribe the audio clip using the Google Cloud Speech-To-Text Engine.
*
* ref: https://cloud.google.com/speech-to-text/docs/reference/rest/v1/speech/recognize
*
* @protected
* @param {String} transcriptionKey - the secret key to the Google service
* @param {String} languageCode - the BCP-47 language code for the recognition, e.g. 'en-GB'
* @return {Promise} a promise resolving to the transcript and associated
* transcription confidence
*/
_GoogleTranscribe(transcriptionKey, languageCode)
{
return new Promise(async (resolve, reject) =>
{
// convert the Float32 PCM audio data to UInt16:
const buffer = new ArrayBuffer(this._audioData.length * 2);
const uint16View = new Uint16Array(buffer);
for (let t = 0; t < this._audioData.length; ++t)
{
uint16View[t] = (this._audioData[t] < 0)
? this._audioData[t] * 0x8000
: this._audioData[t] * 0x7FFF;
}
// encode it to base64:
const base64Data = this._base64ArrayBuffer(new Uint8Array(buffer));
// query the Google speech-to-text service:
const body = {
config: {
encoding: "LINEAR16",
sampleRateHertz: this._sampleRateHz,
languageCode,
},
audio: {
content: base64Data,
},
};
const url = `https://speech.googleapis.com/v1/speech:recognize?key=${transcriptionKey}`;
const response = await fetch(url, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify(body),
});
// convert the response to json:
const decodedResponse = await response.json();
this._psychoJS.logger.debug("speech.googleapis.com response:", JSON.stringify(decodedResponse));
// TODO deal with more than one results and/or alternatives
if (("results" in decodedResponse) && (decodedResponse.results.length > 0))
{
resolve(decodedResponse.results[0].alternatives[0]);
}
else
{
// no transcription available:
resolve({
transcript: "",
confidence: -1,
});
}
});
}
/**
* Decode the formatted audio data (e.g. webm) into a 32bit float PCM audio buffer.
*
* @protected
*/
_decodeAudio()
{
this._psychoJS.logger.debug("request to decode the data of the audio clip");
// if the audio clip is ready, the PCM audio data is available in _audioData, a Float32Array:
if (this._status === AudioClip.Status.READY)
{
return;
}
// if we are already decoding, wait until the process completed:
if (this._status === AudioClip.Status.DECODING)
{
const self = this;
return new Promise(function(resolve, reject)
{
self._decodingCallbacks.push(resolve);
// self._errorCallback = reject; // TODO
}.bind(this));
}
// otherwise, start decoding the input formatted audio data:
this._status = AudioClip.Status.DECODING;
this._audioData = null;
this._source = null;
this._gainNode = null;
this._decodingCallbacks = [];
this._audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: this._sampleRateHz,
});
const reader = new window.FileReader();
reader.onloadend = async () =>
{
try
{
// decode the ArrayBuffer containing the formatted audio data (e.g. webm)
// into an audio buffer:
this._audioBuffer = await this._audioContext.decodeAudioData(reader.result);
// get the Float32Array containing the PCM data:
this._audioData = this._audioBuffer.getChannelData(0);
// we are now ready to translate and play:
this._status = AudioClip.Status.READY;
// resolve all the promises waiting for the decoding to complete:
for (const callback of this._decodingCallbacks)
{
callback();
}
}
catch (error)
{
console.error(error);
// TODO
}
};
reader.onerror = (error) =>
{
// TODO
};
reader.readAsArrayBuffer(this._data);
}
/**
* Convert an array buffer to a base64 string.
*
* @note this is heavily inspired by the following post by @Grantlyk:
* https://gist.github.com/jonleighton/958841#gistcomment-1953137
* It is necessary since the following approach only works for small buffers:
* const dataAsString = String.fromCharCode.apply(null, new Uint8Array(buffer));
* base64Data = window.btoa(dataAsString);
*
* @protected
* @param arrayBuffer - the input buffer
* @return {string} the base64 encoded input buffer
*/
_base64ArrayBuffer(arrayBuffer)
{
let base64 = "";
const encodings = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
const bytes = new Uint8Array(arrayBuffer);
const byteLength = bytes.byteLength;
const byteRemainder = byteLength % 3;
const mainLength = byteLength - byteRemainder;
let a;
let b;
let c;
let d;
let chunk;
// Main loop deals with bytes in chunks of 3
for (let i = 0; i < mainLength; i += 3)
{
// Combine the three bytes into a single integer
chunk = (bytes[i] << 16) | (bytes[i + 1] << 8) | bytes[i + 2];
// Use bitmasks to extract 6-bit segments from the triplet
a = (chunk & 16515072) >> 18; // 16515072 = (2^6 - 1) << 18
b = (chunk & 258048) >> 12; // 258048 = (2^6 - 1) << 12
c = (chunk & 4032) >> 6; // 4032 = (2^6 - 1) << 6
d = chunk & 63; // 63 = 2^6 - 1
// Convert the raw binary segments to the appropriate ASCII encoding
base64 += encodings[a] + encodings[b] + encodings[c] + encodings[d];
}
// Deal with the remaining bytes and padding
if (byteRemainder === 1)
{
chunk = bytes[mainLength];
a = (chunk & 252) >> 2; // 252 = (2^6 - 1) << 2
// Set the 4 least significant bits to zero
b = (chunk & 3) << 4; // 3 = 2^2 - 1
base64 += `${encodings[a]}${encodings[b]}==`;
}
else if (byteRemainder === 2)
{
chunk = (bytes[mainLength] << 8) | bytes[mainLength + 1];
a = (chunk & 64512) >> 10; // 64512 = (2^6 - 1) << 10
b = (chunk & 1008) >> 4; // 1008 = (2^6 - 1) << 4
// Set the 2 least significant bits to zero
c = (chunk & 15) << 2; // 15 = 2^4 - 1
base64 += `${encodings[a]}${encodings[b]}${encodings[c]}=`;
}
return base64;
}
}
/**
* Recognition engines.
*
* @enum {Symbol}
* @readonly
*/
AudioClip.Engine = {
/**
* Google Cloud Speech-to-Text.
*/
GOOGLE: Symbol.for("GOOGLE"),
};
/**
* AudioClip status.
*
* @enum {Symbol}
* @readonly
*/
AudioClip.Status = {
CREATED: Symbol.for("CREATED"),
DECODING: Symbol.for("DECODING"),
READY: Symbol.for("READY"),
};