sound/AudioClip.js

/**
 * AudioClip encapsulates an audio recording.
 *
 * @author Alain Pitiot and Sotiri Bakagiannis
 * @version 2022.2.3
 * @copyright (c) 2021 Open Science Tools Ltd. (https://opensciencetools.org)
 * @license Distributed under the terms of the MIT License
 */

import { PsychoJS } from "../core/PsychoJS.js";
import { ExperimentHandler } from "../data/ExperimentHandler.js";
import { PsychObject } from "../util/PsychObject.js";
import * as util from "../util/Util.js";

/**
 * <p>AudioClip encapsulates an audio recording.</p>
 *
 * @extends PsychObject
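 *
 * @example
 * // A minimal usage sketch; the names here are assumptions for illustration:
 * // `psychoJS` is the running PsychoJS instance and `blob` is a Blob of
 * // recorded audio, e.g. produced by the Microphone component or a MediaRecorder.
 * const clip = new AudioClip({
 * 	psychoJS,
 * 	name: "recording_1",
 * 	format: "audio/webm;codecs=opus",
 * 	sampleRateHz: 48000,
 * 	data: blob,
 * });
 * await clip.startPlayback();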
 */
export class AudioClip extends PsychObject
{
	/**
	 * @memberOf module:sound
	 * @param {Object} options
	 * @param {module:core.PsychoJS} options.psychoJS - the PsychoJS instance
	 * @param {String} [options.name= 'audioclip'] - the name used when logging messages
	 * @param {string} options.format - the format of the audio data
	 * @param {number} options.sampleRateHz - the sampling rate
	 * @param {Blob} options.data - the audio data, in the given format, at the given sampling rate
	 * @param {boolean} [options.autoLog= false] - whether or not to log
	 */
	constructor({ psychoJS, name, sampleRateHz, format, data, autoLog } = {})
	{
		super(psychoJS);

		this._addAttribute("name", name, "audioclip");
		this._addAttribute("format", format);
		this._addAttribute("sampleRateHz", sampleRateHz);
		this._addAttribute("data", data);
		this._addAttribute("autoLog", autoLog, false);
		this._addAttribute("status", AudioClip.Status.CREATED);

		// add a volume attribute, for playback:
		this._addAttribute("volume", 1.0);

		if (this._autoLog)
		{
			this._psychoJS.experimentLogger.exp(`Created ${this.name} = ${this.toString()}`);
		}

		// decode the blob into an audio buffer:
		this._decodeAudio();
	}

	/**
	 * Set the volume of the playback.
	 *
	 * @param {number} volume - the volume of the playback (must be between 0.0 and 1.0)
	 */
	setVolume(volume)
	{
		this._volume = volume;
	}

	/**
	 * Start playing the audio clip.
	 *
	 * @public
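	 *
	 * @example
	 * // sketch: play the clip at half volume, then stop it after one second
	 * clip.setVolume(0.5);
	 * await clip.startPlayback();
	 * setTimeout(() => clip.stopPlayback(), 1000);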
	 */
	async startPlayback()
	{
		this._psychoJS.logger.debug("request to play the audio clip");

		// wait for the decoding to complete:
		await this._decodeAudio();

		// note: we need to prepare the audio graph anew each time since, for instance, an
		// AudioBufferSourceNode can only be played once
		// ref: https://developer.mozilla.org/en-US/docs/Web/API/AudioBufferSourceNode

		// create a source node from the in-memory audio data in _audioBuffer:
		this._source = this._audioContext.createBufferSource();
		this._source.buffer = this._audioBuffer;

		// create a gain node, so we can control the volume:
		this._gainNode = this._audioContext.createGain();

		// connect the nodes:
		this._source.connect(this._gainNode);
		this._gainNode.connect(this._audioContext.destination);

		// set the volume:
		this._gainNode.gain.value = this._volume;

		// start the playback:
		this._source.start();
	}

	/**
	 * Stop playing the audio clip.
	 *
	 * @param {number} [fadeDuration = 17] - how long the fading out should last, in ms
	 */
	async stopPlayback(fadeDuration = 17)
	{
		// TODO deal with fade duration

		// stop the playback, if it has been started:
		if (this._source)
		{
			this._source.stop();
		}
	}

	/**
	 * Get the duration of the audio clip, in seconds.
	 *
	 * @returns {Promise<number>} the duration of the audio clip
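	 *
	 * @example
	 * // sketch: getDuration waits for the decoding to complete, so it must be awaited
	 * const duration = await clip.getDuration();
	 * console.log(`the clip lasts ${duration} seconds`);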
	 */
	async getDuration()
	{
		// wait for the decoding to complete:
		await this._decodeAudio();

		return this._audioBuffer.duration;
	}

	/**
	 * Upload the audio clip to the pavlovia server.
	 *
	 * @public
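	 *
	 * @example
	 * // sketch: upload the clip at the end of a trial; when the experiment is not
	 * // running on the server, e.g. locally or in piloting mode, the clip is
	 * // offered to the participant as a file download instead
	 * await clip.upload();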
	 */
	upload()
	{
		this._psychoJS.logger.debug("request to upload the audio clip to pavlovia.org");

		// add a format-dependent audio extension to the name:
		const filename = this._name + util.extensionFromMimeType(this._format);

		// if the audio recording cannot be uploaded, e.g. because the experiment is running
		// locally or in piloting mode, then we offer the audio clip as a file for download:
		if (
			this._psychoJS.getEnvironment() !== ExperimentHandler.Environment.SERVER
			|| this._psychoJS.config.experiment.status !== "RUNNING"
			|| this._psychoJS._serverMsg.has("__pilotToken")
		)
		{
			return this.download(filename);
		}

		// upload the data:
		return this._psychoJS.serverManager.uploadAudioVideo({
			mediaBlob: this._data,
			tag: filename
		});
	}

	/**
	 * Offer the audio clip to the participant as a sound file to download.
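	 *
	 * @param {string} [filename= "audio.webm"] - the name of the downloaded file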
	 */
	download(filename = "audio.webm")
	{
		const anchor = document.createElement("a");
		anchor.href = window.URL.createObjectURL(this._data);
		anchor.download = filename;
		document.body.appendChild(anchor);
		anchor.click();
		document.body.removeChild(anchor);
	}

	/**
	 * Transcribe the audio clip.
	 *
	 * @param {Object} options
	 * @param {Symbol} options.engine - the speech-to-text engine
	 * @param {String} options.languageCode - the BCP-47 language code for the recognition,
	 * e.g. 'en-GB'
	 * @return {Promise} a promise resolving to the transcript and associated
	 * transcription confidence
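	 *
	 * @example
	 * // sketch: transcribe the clip with the Google engine; this assumes that an
	 * // API key named 'sound.AudioClip.Engine.GOOGLE' was added to the experiment
	 * const transcription = await clip.transcribe({
	 * 	engine: AudioClip.Engine.GOOGLE,
	 * 	languageCode: "en-GB",
	 * });
	 * console.log(transcription.transcript, transcription.confidence);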
	 */
	async transcribe({ engine, languageCode } = {})
	{
		const response = {
			origin: "AudioClip.transcribe",
			context: `when transcribing audio clip: ${this._name}`,
		};

		this._psychoJS.logger.debug(response);

		// get the secret key from the experiment configuration:
		const fullEngineName = `sound.AudioClip.Engine.${Symbol.keyFor(engine)}`;
		let transcriptionKey;
		for (const key of this._psychoJS.config.experiment.keys)
		{
			if (key.name === fullEngineName)
			{
				transcriptionKey = key.value;
			}
		}
		if (typeof transcriptionKey === "undefined")
		{
			throw {
				...response,
				error: `missing key for engine: ${fullEngineName}`,
			};
		}

		// wait for the decoding to complete:
		await this._decodeAudio();

		// dispatch on engine:
		if (engine === AudioClip.Engine.GOOGLE)
		{
			return this._GoogleTranscribe(transcriptionKey, languageCode);
		}
		else
		{
			// note: String() is required here since Symbols cannot be implicitly
			// converted to strings in template literals
			throw {
				...response,
				error: `unsupported speech-to-text engine: ${String(engine)}`,
			};
		}
	}

	/**
	 * Transcribe the audio clip using the Google Cloud Speech-To-Text Engine.
	 *
	 * ref: https://cloud.google.com/speech-to-text/docs/reference/rest/v1/speech/recognize
	 *
	 * @protected
	 * @param {String} transcriptionKey - the secret key to the Google service
	 * @param {String} languageCode - the BCP-47 language code for the recognition, e.g. 'en-GB'
	 * @return {Promise} a promise resolving to the transcript and associated
	 * transcription confidence
	 */
	async _GoogleTranscribe(transcriptionKey, languageCode)
	{
		// convert the Float32 PCM audio data to signed 16-bit integers (LINEAR16);
		// negative values wrap to their two's complement representation in the Uint16Array:
		const buffer = new ArrayBuffer(this._audioData.length * 2);
		const uint16View = new Uint16Array(buffer);
		for (let t = 0; t < this._audioData.length; ++t)
		{
			uint16View[t] = (this._audioData[t] < 0)
				? this._audioData[t] * 0x8000
				: this._audioData[t] * 0x7FFF;
		}

		// encode it to base64:
		const base64Data = this._base64ArrayBuffer(new Uint8Array(buffer));

		// query the Google speech-to-text service:
		const body = {
			config: {
				encoding: "LINEAR16",
				sampleRateHertz: this._sampleRateHz,
				languageCode,
			},
			audio: {
				content: base64Data,
			},
		};
		const url = `https://speech.googleapis.com/v1/speech:recognize?key=${transcriptionKey}`;
		const response = await fetch(url, {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
			},
			body: JSON.stringify(body),
		});

		// convert the response to json:
		const decodedResponse = await response.json();
		this._psychoJS.logger.debug("speech.googleapis.com response:", JSON.stringify(decodedResponse));

		// TODO deal with more than one result and/or alternative
		if (("results" in decodedResponse) && (decodedResponse.results.length > 0))
		{
			return decodedResponse.results[0].alternatives[0];
		}

		// no transcription available:
		return {
			transcript: "",
			confidence: -1,
		};
	}

	/**
	 * Decode the formatted audio data (e.g. webm) into a 32-bit float PCM audio buffer.
	 *
	 * @protected
	 */
	_decodeAudio()
	{
		this._psychoJS.logger.debug("request to decode the data of the audio clip");

		// if the audio clip is ready, the PCM audio data is available in _audioData, a Float32Array:
		if (this._status === AudioClip.Status.READY)
		{
			return;
		}

		// if we are already decoding, wait until the process has completed:
		if (this._status === AudioClip.Status.DECODING)
		{
			return new Promise((resolve, reject) =>
			{
				this._decodingCallbacks.push(resolve);
				// this._errorCallback = reject; // TODO
			});
		}

		// otherwise, start decoding the input formatted audio data:
		this._status = AudioClip.Status.DECODING;
		this._audioData = null;
		this._source = null;
		this._gainNode = null;
		this._decodingCallbacks = [];

		this._audioContext = new (window.AudioContext || window.webkitAudioContext)({
			sampleRate: this._sampleRateHz,
		});

		const reader = new window.FileReader();
		reader.onloadend = async () =>
		{
			try
			{
				// decode the ArrayBuffer containing the formatted audio data (e.g. webm)
				// into an audio buffer:
				this._audioBuffer = await this._audioContext.decodeAudioData(reader.result);

				// get the Float32Array containing the PCM data:
				this._audioData = this._audioBuffer.getChannelData(0);

				// we are now ready to translate and play:
				this._status = AudioClip.Status.READY;

				// resolve all the promises waiting for the decoding to complete:
				for (const callback of this._decodingCallbacks)
				{
					callback();
				}
			}
			catch (error)
			{
				console.error(error);
				// TODO
			}
		};
		reader.onerror = (error) =>
		{
			// TODO
		};
		reader.readAsArrayBuffer(this._data);
	}

	/**
	 * Convert an array buffer to a base64 string.
	 *
	 * @note this is heavily inspired by the following post by @Grantlyk:
	 * https://gist.github.com/jonleighton/958841#gistcomment-1953137
	 * It is necessary since the following approach only works for small buffers:
	 *   const dataAsString = String.fromCharCode.apply(null, new Uint8Array(buffer));
	 *   base64Data = window.btoa(dataAsString);
	 *
	 * @protected
	 * @param arrayBuffer - the input buffer
	 * @return {string} the base64 encoded input buffer
	 */
	_base64ArrayBuffer(arrayBuffer)
	{
		let base64 = "";
		const encodings = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

		const bytes = new Uint8Array(arrayBuffer);
		const byteLength = bytes.byteLength;
		const byteRemainder = byteLength % 3;
		const mainLength = byteLength - byteRemainder;

		let a;
		let b;
		let c;
		let d;
		let chunk;

		// Main loop deals with bytes in chunks of 3
		for (let i = 0; i < mainLength; i += 3)
		{
			// Combine the three bytes into a single integer
			chunk = (bytes[i] << 16) | (bytes[i + 1] << 8) | bytes[i + 2];

			// Use bitmasks to extract 6-bit segments from the triplet
			a = (chunk & 16515072) >> 18; // 16515072 = (2^6 - 1) << 18
			b = (chunk & 258048) >> 12; // 258048 = (2^6 - 1) << 12
			c = (chunk & 4032) >> 6; // 4032 = (2^6 - 1) << 6
			d = chunk & 63; // 63 = 2^6 - 1

			// Convert the raw binary segments to the appropriate ASCII encoding
			base64 += encodings[a] + encodings[b] + encodings[c] + encodings[d];
		}

		// Deal with the remaining bytes and padding
		if (byteRemainder === 1)
		{
			chunk = bytes[mainLength];

			a = (chunk & 252) >> 2; // 252 = (2^6 - 1) << 2

			// Set the 4 least significant bits to zero
			b = (chunk & 3) << 4; // 3 = 2^2 - 1

			base64 += `${encodings[a]}${encodings[b]}==`;
		}
		else if (byteRemainder === 2)
		{
			chunk = (bytes[mainLength] << 8) | bytes[mainLength + 1];

			a = (chunk & 64512) >> 10; // 64512 = (2^6 - 1) << 10
			b = (chunk & 1008) >> 4; // 1008 = (2^6 - 1) << 4

			// Set the 2 least significant bits to zero
			c = (chunk & 15) << 2; // 15 = 2^4 - 1

			base64 += `${encodings[a]}${encodings[b]}${encodings[c]}=`;
		}

		return base64;
	}
}

/**
 * Recognition engines.
 *
 * @enum {Symbol}
 * @readonly
 */
AudioClip.Engine = {
	/**
	 * Google Cloud Speech-to-Text.
	 */
	GOOGLE: Symbol.for("GOOGLE"),
};

/**
 * AudioClip status.
 *
 * @enum {Symbol}
 * @readonly
 */
AudioClip.Status = {
	CREATED: Symbol.for("CREATED"),
	DECODING: Symbol.for("DECODING"),
	READY: Symbol.for("READY"),
};