chore: groundwork

This commit is contained in:
lutangar 2024-03-29 10:34:45 +01:00
parent b824480af7
commit fbc6ca2270
18 changed files with 499 additions and 0 deletions

View file

@ -0,0 +1,34 @@
import { CpuInfo, cpus } from 'os'
import { TranscriptionEngine } from '@peertube/transcription'
const WER_TOLERANCE = 1
const CER_TOLERANCE = 1
interface TestResult {
WER: number
CER: number
duration: number
engine: TranscriptionEngine
dataThroughput: number // relevant ?
cpus: CpuInfo[]
}
// System information that could be recorded alongside each benchmark run:
console.log(cpus())
// console.log(os.totalmem())
// console.log(os.freemem())
const testsResults: Record<string, TestResult> = {}
// Placeholder: should run the given backend and model on the media file and measure WER, CER and duration
async function testTranscriptGeneration (transformerBackend: string, model: string, mediaFilePath: string) {
const testResults = {
WER: 3,
CER: 3,
duration: 3
}
return testResults
}

View file

@ -0,0 +1,33 @@
import { join } from 'path'
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
import { remove, pathExistsSync } from 'fs-extra/esm'
import { $ } from 'execa'
import { expect } from 'chai'
import { WhisperEngine } from '@peertube/transcription'
describe('Whisper', function () {
const transcriptDirectory = join(root(), 'test-transcript')
// whisper names its transcript files after the input media file
const vttTranscriptPath = join(transcriptDirectory, 'video_very_long_10p.vtt')
it('Should be present on the system', async function () {
await $`whisper --help`
})
it('Should run transcription on a media file without raising any errors', async function () {
const mediaFilePath = buildAbsoluteFixturePath('video_short.mp4')
const whisperEngine = new WhisperEngine({ transcriptDirectory })
await whisperEngine.transcribe('tiny', mediaFilePath)
})
it('Should create a VTT transcript file', async function () {
const mediaFilePath = buildAbsoluteFixturePath('video_very_long_10p.mp4')
const whisperEngine = new WhisperEngine({ transcriptDirectory })
await whisperEngine.transcribe('tiny', mediaFilePath)
expect(pathExistsSync(vttTranscriptPath)).to.be.true
})
after(async function () {
await remove(transcriptDirectory)
})
})

View file

@ -16,6 +16,7 @@
{ "path": "../node-utils" },
{ "path": "../typescript-utils" },
{ "path": "../server-commands" },
{ "path": "../transcription" },
{ "path": "../../server/tsconfig.lib.json" }
],
"include": [

View file

@ -0,0 +1,21 @@
DeepLearningFramework vs training libraries
```typescript
interface DeepLearningFramework {
  name: string
  distributed?: boolean
  gpu?: boolean
}

const deepLearningFrameworks: DeepLearningFramework[] = [
  {
    name: 'PyTorch',
    distributed: true,
    gpu: true
  },
  {
    name: 'TensorFlow'
  }
]
```

View file

@ -0,0 +1,19 @@
{
"name": "@peertube/transcription",
"private": true,
"version": "0.0.0",
"main": "dist/index.js",
"files": [ "dist" ],
"exports": {
"types": "./dist/index.d.ts",
"peertube:tsx": "./src/index.ts",
"default": "./dist/index.js"
},
"type": "module",
"devDependencies": {},
"scripts": {
"build": "tsc",
"watch": "tsc -w"
},
"dependencies": {}
}

View file

@ -0,0 +1,4 @@
export * from './whisper/index.js'
export * from './transcription-engine.js'
export * from './transcription-model.js'
export * from './transcription-result.js'

View file

@ -0,0 +1,32 @@
import { join } from 'path'
import { root } from '@peertube/peertube-node-utils'
import { TranscriptionModel } from './transcription-model.js'
import { TranscriptionResult } from './transcription-result.js'
export abstract class TranscriptionEngine {
public name: string
public description: string
public language: string
public requirements: string[]
public type: 'binary' | 'bindings' | 'ws'
public license: string
public forgeURL: string
public static DEFAULT_TRANSCRIPT_DIRECTORY = join(root(), 'dist', 'transcripts')
// There could be a list of default models
public abstract transcribe (
model: TranscriptionModel | string,
mediaFilePath: string,
language: string,
outputFormat: string
): Promise<TranscriptionResult>
public abstract loadModel (model: TranscriptionModel)
public abstract detectLanguage (): Promise<string>
public abstract supports (model: TranscriptionModel): boolean
static getModelName (model: TranscriptionModel | string) {
return typeof model === 'string' ? model : model.name
}
}
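// A usage sketch (illustrative only, the media file path and `model` are assumptions):
// given a model and the `engines` list exported by this package, pick a supporting
// engine and transcribe a media file to VTT.
//
// const engine = engines.find(engine => engine.supports(model))
// const { transcriptFilePath } = await engine.transcribe(model, '/tmp/video_short.mp4', 'en', 'vtt')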

View file

@ -0,0 +1,60 @@
// Convert Whisper transformer model from PyTorch to ggml format
// You can convert the original Whisper PyTorch models provided by OpenAI to
// ggml format in order to be able to load them in C/C++
// In supervised machine learning, the artefact created after training that is used to make predictions on new data is called a model.
// models can be saved in a file that can potentially be compressed, so typically model files have a binary file format
// TensorFlow saves models as protocol buffer files, with a .pb file extension.
// Keras saves models natively as .h5 file.
// Scikit-Learn saves models as pickled python objects, with a .pkl file extension.
// An older format for model serving based on XML, predictive model markup language (.pmml), is still usable on some frameworks, such as Scikit-Learn.
// Training File Formats :
// - petastorm
// - npy
// - tfrecords
// Model Serving Serialization Formats:
// - pb
// - mlmodel
// - onnx
// - pkl
// - older: h5, pmml
// Hugging Face fine-tuned models to ggml format
// or Whisper transformer model?
// ML models vs Transformer Model
// Transcription Model
// Other model file formats that are used include SparkML models that can be saved in MLeap file format and served in real-time using an MLeap model server (files are packaged in .zip format). Apple developed the .mlmodel file format to store models embedded in iOS applications as part of its Core ML framework (which has superior support for Objective-C and Swift languages). Applications trained in TensorFlow, Scikit-Learn, and other frameworks need to convert their model files to the .mlmodel file format for use on iOS, with tools like coremltools and the TensorFlow converter being available to help with file format conversion. ONNX is an ML framework-independent file format, supported by Microsoft, Facebook, and Amazon. In theory, any ML framework should be able to export its models in .onnx file format, so it offers great promise in unifying model serving across the different frameworks. However, as of late 2019, ONNX does not support all operations for the most popular ML frameworks (TensorFlow, PyTorch, Scikit-Learn), so ONNX is not yet practical for those frameworks. In PyTorch, the recommended way to serve models is to use TorchScript to trace and save a model as a .pt file and serve it from a C++ application.
//
// One final file format to mention here is YAML that is used to package models as part of the MLFlow framework for ML pipelines on Spark. MLFlow stores a YAML file that describes the files it packages for model serving, so that deployment tools can understand the model file format and know what files to deploy.
// // ModelServingFileSerializationFormats
// File formats: .pb, .onnx, .pkl, .mlmodel, .zip, .pmml, .pt
// Inference: .pb files are served by TensorFlowServing Server;
// .onnx files are served by Microsoft's commercial model serving platform;
// .pkl files are served for Scikit-Learn models, often on Flask servers;
// .mlmodel files are served by iOS platforms;
// .zip files are used to package up MLeap files that are served on the MLeap runtime;
// .pt files are used to package PyTorch models that can be served inside C++ applications.
// .'PyTorch' | 'GGML' | 'ONNX' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark
// https://towardsdatascience.com/guide-to-file-formats-for-machine-learning-columnar-training-inferencing-and-the-feature-store-2e0c3d18d4f9
export abstract class TranscriptionModel {
name: string
format?: 'PyTorch' | 'GGML' | 'ONNX' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark
path?: string
url?: string
// # - hparams
// # - Number of dimensions (int)
// # - Name length (int)
// # - Dimensions (int[n_dims])
// # - Name (char[name_length])
// # - Data (float[n_dims])
// # - mel filters
// # - tokenizer vocab
// # - model variables
}
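// A minimal illustration (not part of this commit's API): how a converted model could be
// described with this class, assuming a hypothetical ggml Whisper "tiny" file produced by
// whisper.cpp's conversion script and stored at an arbitrary path.
//
// const whisperTinyGgml: TranscriptionModel = {
//   name: 'tiny',
//   format: 'GGML',
//   path: '/var/peertube/models/ggml-tiny.bin'
// }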

View file

@ -0,0 +1 @@
export type TranscriptionResult = { transcriptFilePath: string, language?: string }

View file

@ -0,0 +1,81 @@
- cpp
- ctranslate2
- faster-whisper
- insanely-fast-whisper
- whisper
- transformers.js
- whisperX
Transformers* could be defined as an all-purpose inference engine instead of a Whisper-only engine:
- to create a video summary
-
// mixed precision training
// env.cacheDir = './.cache';
// env.localModelPath = '/path/to/models/';
// env.allowRemoteModels = false;
// To optimize the data pipeline, you should use techniques such as
// caching,
// prefetching,
// batching,
// sharding, and
// compression, depending on the characteristics and size of your data.
// You should also monitor the data throughput and utilization of the GPU and CPU devices, and adjust the data pipeline accordingly.
// 1) Prefetching: To load data asynchronously while the model is training on the current batch. This minimizes data loading bottlenecks.
// 2) Data Sampling for initial models: For initial model development or debugging, working with a smaller subset of your data can help speed up setup and output.
// 3) Parallel Processing: This is the most obvious and important point. Utilize multi-threading or multiprocessing libraries like concurrent.futures in Python to preprocess data in parallel. This is particularly effective when dealing with large datasets.
// https://www.linkedin.com/advice/3/how-can-you-optimize-machine-learning-models
// Use mixed precision training
// Apply model pruning and quantization
// Sizing the model will almost always help with performance,
// On GPUs,
// - leverage batch processing
// - and mixed-precision training,
// - manage GPU memory,
// - and consider model pruning.
// For CPUs,
// - utilize multi-threading,
// - efficient libraries,
// - batch inference, quantization,
// - and model optimization.
// - Employ
// - compiler flags,
// - caching,
// - and distributed computing for CPU performance.
// Profiling tools help identify bottlenecks on both hardware types, ensuring efficient model deployment in diverse environments.
// The choice between GPU and CPU optimization depends on the specific task and hardware resources available.
// It would be nice to be able to run tests on GPU runners from GitHub Actions:
// https://resources.github.com/devops/accelerate-your-cicd-with-arm-and-gpu-runners-in-github-actions/
// Techniques such as
// model quantization, pruning,
// and other optimizations can further enhance the efficiency of running these models on CPU hardware.
// If you're looking to deploy Whisper models on CPU-based systems, you can use popular deep learning frameworks like TensorFlow or PyTorch, which provide support for deploying models on CPU and offer optimizations for inference performance. Additionally, platforms like ONNX Runtime or TensorFlow Lite offer optimizations for inference on CPU, including support for quantized models and hardware acceleration where available.
// https://eval.ai/web/challenges/challenge-page/1637/overview
// https://github.com/fquirin/speech-recognition-experiments
// => are producing models
// PyTorch and TensorFlow
// deepLearningFramework
// cpp.ts
// ctranslate2.ts
// faster.ts
// insanely-fast.ts
// python.ts
// transformer.ts
// X .ts
// whisper.cpp
// ggml
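
A rough sketch of how the transformers.js settings mentioned above could be wired up. The model checkpoint, cache paths, and sample audio URL are illustrative assumptions, not part of this commit:

```typescript
import { pipeline, env } from '@xenova/transformers'

// Illustrative paths, to be replaced by PeerTube's actual storage layout
env.cacheDir = './.cache'
env.localModelPath = './models'
env.allowRemoteModels = false

// 'Xenova/whisper-tiny' is only an example checkpoint, assumed to be available locally
const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny')

// Sample audio file taken from the transformers.js documentation
const { text } = await transcriber('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav')
console.log(text)
```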

View file

@ -0,0 +1,38 @@
import { existsSync } from 'fs'
import { TranscriptionModel } from '../../transcription-model.js'
import { TranscriptionEngine } from '../../transcription-engine.js'
import { TranscriptionResult } from '../../transcription-result.js'
export class WhisperCppEngine implements TranscriptionEngine {
name = 'whisper.cpp'
description = 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model'
type = 'binary' as const
language = 'cpp'
requirements = []
forgeURL = 'https://github.com/ggerganov/whisper.cpp'
license = 'MIT'
detectLanguage () {
return Promise.resolve('')
}
loadModel (model: TranscriptionModel) {
if (existsSync(model.path)) { /* empty */ }
}
supports (model: TranscriptionModel) {
return true
}
transcribe (
model: TranscriptionModel | string,
mediaFilePath: string,
language: string,
outputFormat: string
): Promise<TranscriptionResult> {
return Promise.resolve(undefined)
}
}
export const whisperCppEngine = new WhisperCppEngine()

View file

@ -0,0 +1,12 @@
import { TranscriptionEngine } from '../../transcription-engine.js'
import { whisperEngine } from './python.js'
import { whisperCppEngine } from './cpp.js'
import { transformers } from './transformers.js'
import { transformersJs } from './transformers-js.js'
export const engines: TranscriptionEngine[] = [
whisperCppEngine,
whisperEngine,
transformers,
transformersJs
]

View file

@ -0,0 +1,4 @@
export * from './cpp.js'
export * from './python.js'
export * from './transformers.js'
export * from './transformers-js.js'

View file

@ -0,0 +1,65 @@
import { existsSync } from 'fs'
import { join, parse } from 'path'
import { ChildProcess } from 'child_process'
import { $ } from 'execa'
import { TranscriptionEngine } from '../../transcription-engine.js'
import { TranscriptionModel } from '../../transcription-model.js'
import { TranscriptionResult } from '../../transcription-result.js'
type TranscriptFormat = 'txt' | 'vtt' | 'srt'
export class WhisperEngine implements TranscriptionEngine {
name = 'whisper'
description = 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model'
requirements = ['python', 'pyTorch', 'ffmpeg']
language = 'python'
type = 'binary' as const
binary = 'whisper'
forgeURL = 'https://github.com/openai/whisper'
license = 'MIT'
process?: ChildProcess
transcriptDirectory: string
public constructor (options: { transcriptDirectory?: string } = {}) {
this.transcriptDirectory = options.transcriptDirectory ?? TranscriptionEngine.DEFAULT_TRANSCRIPT_DIRECTORY
}
detectLanguage () {
return Promise.resolve('')
}
loadModel (model: TranscriptionModel) {
if (existsSync(model.path)) { /* empty */ }
}
supports (model: TranscriptionModel) {
return model.format === 'PyTorch'
}
async transcribe (
model: TranscriptionModel | string,
mediaFilePath: string,
format: TranscriptFormat = 'vtt'
): Promise<TranscriptionResult> {
const $$ = $({ verbose: true })
await $$`whisper ${[
mediaFilePath,
'--model',
TranscriptionEngine.getModelName(model),
'--output_format',
'all',
'--output_dir',
this.transcriptDirectory
]}`
return {
language: '',
// whisper names its output files after the input media file
transcriptFilePath: join(this.transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
}
}
}
export const whisperEngine = new WhisperEngine()

View file

@ -0,0 +1,42 @@
// import { pipeline, env } from '@xenova/transformers'
import { TranscriptionModel } from '../../transcription-model.js'
import { TranscriptionEngine } from '../../transcription-engine.js'
import { TranscriptionResult } from '../../transcription-result.js'
// Allow local models
// env.allowLocalModels = true
class TransformersJs implements TranscriptionEngine {
name = 'transformers.js'
description = ''
requirements = []
language = 'js'
forgeURL = 'https://github.com/xenova/transformers.js'
license = 'Apache-2.0'
type = 'bindings' as const
transcribe (
model: TranscriptionModel | string,
mediaFilePath: string,
language: string,
outputFormat: string
): Promise<TranscriptionResult> {
return Promise.resolve(undefined)
// return pipeline('automatic-speech-recognition', 'no_attentions', {
// // For medium models, we need to load the `no_attentions` revision to avoid running out of memory
// revision: [].includes('/whisper-medium') ? 'no_attentions' : 'main'
// })
}
detectLanguage (): Promise<string> {
return Promise.resolve('')
}
loadModel (model: TranscriptionModel) {
}
supports (model: TranscriptionModel): boolean {
return false
}
}
export const transformersJs = new TransformersJs()

View file

@ -0,0 +1,38 @@
import { TranscriptionEngine } from '../../transcription-engine.js'
import { TranscriptionModel } from '../../transcription-model.js'
import { existsSync } from 'fs'
import { TranscriptionResult } from '../../transcription-result.js'
export class Transformers implements TranscriptionEngine {
name = 'transformers'
description = 'Hugging Face Transformers implementation of OpenAI\'s Whisper automatic speech recognition model'
type = 'binary' as const
language = 'python'
requirements = []
forgeURL = 'https://github.com/huggingface/transformers'
license = 'Apache-2.0'
supports (model: TranscriptionModel) {
return true
}
detectLanguage () {
return Promise.resolve('')
}
loadModel (model: TranscriptionModel) {
if (existsSync(model.path)) { /* empty */ }
}
transcribe (
model: TranscriptionModel | string,
mediaFilePath: string,
language: string,
outputFormat: string
): Promise<TranscriptionResult> {
return Promise.resolve(undefined)
}
}
export const transformers = new Transformers()

View file

@ -0,0 +1 @@
export * from './engine/index.js'

View file

@ -0,0 +1,13 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"outDir": "./dist",
"rootDir": "src",
"tsBuildInfoFile": "./dist/.tsbuildinfo"
},
"references": [
{ "path": "../models" },
{ "path": "../core-utils" },
{ "path": "../node-utils" }
]
}