infio-copilot-dev/src/utils/youtube-transcript.ts

/**
 * This source code is licensed under the MIT license.
 * Original source: https://github.com/Kakulukian/youtube-transcript
 *
 * Modified from the original code
 */

import { requestUrl } from 'obsidian'

const RE_YOUTUBE =
	/(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i
const USER_AGENT =
	'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)'
const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g

export function isYoutubeUrl(url: string) {
	return RE_YOUTUBE.test(url)
}

export class YoutubeTranscriptError extends Error {
	constructor(message: string) {
		super(`[YoutubeTranscript] 🚨 ${message}`)
	}
}

export class YoutubeTranscriptTooManyRequestError extends YoutubeTranscriptError {
	constructor() {
		super(
			'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue',
		)
	}
}

export class YoutubeTranscriptVideoUnavailableError extends YoutubeTranscriptError {
	constructor(videoId: string) {
		super(`The video is no longer available (${videoId})`)
	}
}

export class YoutubeTranscriptDisabledError extends YoutubeTranscriptError {
	constructor(videoId: string) {
		super(`Transcript is disabled on this video (${videoId})`)
	}
}

export class YoutubeTranscriptNotAvailableError extends YoutubeTranscriptError {
	constructor(videoId: string) {
		super(`No transcripts are available for this video (${videoId})`)
	}
}

export class YoutubeTranscriptNotAvailableLanguageError extends YoutubeTranscriptError {
	constructor(lang: string, availableLangs: string[], videoId: string) {
		super(
			`No transcripts are available in ${lang} this video (${videoId}). Available languages: ${availableLangs.join(
				', ',
			)}`,
		)
	}
}

export type TranscriptConfig = {
	lang?: string
}
export type Transcript = {
	text: string
	duration: number
	offset: number
	lang?: string
}

export type TranscriptAndMetadataResponse = {
	title: string
	transcript: Transcript[]
}

/**
 * Class to retrieve transcript if exist
 */
export class YoutubeTranscript {
	/**
	 * Fetch transcript from YTB Video
	 * @param videoId Video url or video identifier
	 * @param config Get transcript in a specific language ISO
	 */
	public static async fetchTranscriptAndMetadata(
		videoId: string,
		config?: TranscriptConfig,
	): Promise<TranscriptAndMetadataResponse> {
		const identifier = this.retrieveVideoId(videoId)
		const videoPageResponse = await requestUrl({
			url: `https://www.youtube.com/watch?v=${identifier}`,
			headers: {
				...(config?.lang && { 'Accept-Language': config.lang }),
				'User-Agent': USER_AGENT,
			},
		})
		const videoPageBody = videoPageResponse.text

		// Extract title using regex from <title> tags
		const titleMatch = /<title>(.*?)<\/title>/.exec(videoPageBody)
		const title = titleMatch
			? titleMatch[1].replace(' - YouTube', '').trim()
			: ''

		const splittedHTML = videoPageBody.split('"captions":')

		if (splittedHTML.length <= 1) {
			if (videoPageBody.includes('class="g-recaptcha"')) {
				throw new YoutubeTranscriptTooManyRequestError()
			}
			if (!videoPageBody.includes('"playabilityStatus":')) {
				throw new YoutubeTranscriptVideoUnavailableError(videoId)
			}
			throw new YoutubeTranscriptDisabledError(videoId)
		}

		const captions = (() => {
			try {
				// eslint-disable-next-line @typescript-eslint/no-unsafe-return
				return JSON.parse(
					splittedHTML[1].split(',"videoDetails')[0].replace('\n', ''),
				)
			} catch (e) {
				return undefined
			}
		})()?.playerCaptionsTracklistRenderer

		if (!captions) {
			throw new YoutubeTranscriptDisabledError(videoId)
		}

		if (!('captionTracks' in captions)) {
			throw new YoutubeTranscriptNotAvailableError(videoId)
		}

		if (
			config?.lang &&
			!captions.captionTracks.some(
				// eslint-disable-next-line @typescript-eslint/no-explicit-any
				(track: any) => track.languageCode === config?.lang,
			)
		) {
			throw new YoutubeTranscriptNotAvailableLanguageError(
				config?.lang,
				// eslint-disable-next-line @typescript-eslint/no-unsafe-argument, @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-return
				captions.captionTracks.map((track: any) => track.languageCode),
				videoId,
			)
		}

		const transcriptURL: string = (
			config?.lang
				? captions.captionTracks.find(
					// eslint-disable-next-line @typescript-eslint/no-explicit-any
					(track: any) => track.languageCode === config?.lang,
				)
				: captions.captionTracks[0]
		).baseUrl

		const transcriptResponse = await requestUrl({
			url: transcriptURL,
			headers: {
				...(config?.lang && { 'Accept-Language': config.lang }),
				'User-Agent': USER_AGENT,
			},
		})
		if (transcriptResponse.status !== 200) {
			throw new YoutubeTranscriptNotAvailableError(videoId)
		}
		const transcriptBody = transcriptResponse.text
		const results = [...transcriptBody.matchAll(RE_XML_TRANSCRIPT)]
		return {
			title,
			transcript: results.map((result) => ({
				text: result[3],
				duration: parseFloat(result[2]),
				offset: parseFloat(result[1]),
				lang: config?.lang ?? captions.captionTracks[0].languageCode,
			})),
		}
	}

	/**
	 * Retrieve video id from url or string
	 * @param videoId video url or video id
	 */
	private static retrieveVideoId(videoId: string) {
		if (videoId.length === 11) {
			return videoId
		}
		const matchId = RE_YOUTUBE.exec(videoId)
		if (matchId?.length) {
			return matchId[1]
		}
		throw new YoutubeTranscriptError('Impossible to retrieve Youtube video ID.')
	}
}