import { getRequestHeaders } from '../script.js';
import { renderExtensionTemplateAsync } from './extensions.js';
import { POPUP_RESULT, POPUP_TYPE, callGenericPopup } from './popup.js';
import { SlashCommand } from './slash-commands/SlashCommand.js';
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from './slash-commands/SlashCommandArgument.js';
import { SlashCommandParser } from './slash-commands/SlashCommandParser.js';
import { isValidUrl } from './utils.js';
/**
 * @typedef {Object} Scraper
 * @property {string} id Unique identifier of the scraper
 * @property {string} name Display name of the scraper
 * @property {string} description Short description of what the scraper does
 * @property {string} iconClass CSS class of the scraper's icon
 * @property {boolean} iconAvailable Whether the icon is available
 * @property {() => Promise<void>} [init] Optional initialization function, awaited once when the scraper is registered
 * @property {() => Promise<boolean>} isAvailable Checks if the scraper can currently be used
 * @property {() => Promise<File[]>} scrape Runs the scraper and returns the files it produced
 */
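/*
 * A minimal sketch of an object satisfying the Scraper typedef, for
 * illustration only (the "clipboard" scraper and all of its fields are
 * hypothetical, not part of this module):
 *
 *   const clipboardScraper = {
 *       id: 'clipboard',
 *       name: 'Clipboard',
 *       description: 'Create a file from the clipboard contents.',
 *       iconClass: 'fa-solid fa-clipboard',
 *       iconAvailable: true,
 *       isAvailable: async () => Boolean(navigator.clipboard),
 *       scrape: async () => {
 *           const text = await navigator.clipboard.readText();
 *           return [new File([text], 'Clipboard.txt', { type: 'text/plain' })];
 *       },
 *   };
 */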
/**
 * @typedef {Object} ScraperInfo
 * @property {string} id Unique identifier of the scraper
 * @property {string} name Display name of the scraper
 * @property {string} description Short description of what the scraper does
 * @property {string} iconClass CSS class of the scraper's icon
 * @property {boolean} iconAvailable Whether the icon is available
 */
export class ScraperManager {
    /**
     * @type {Scraper[]}
     */
    static #scrapers = [];

    /**
     * Register a scraper to be used by the Data Bank.
     * @param {Scraper} scraper Instance of a scraper to register
     */
    static async registerDataBankScraper(scraper) {
        if (ScraperManager.#scrapers.some(s => s.id === scraper.id)) {
            console.warn(`Scraper with ID ${scraper.id} already registered`);
            return;
        }

        if (scraper.init) {
            await scraper.init();
        }

        ScraperManager.#scrapers.push(scraper);
    }

    /**
     * Gets a list of scrapers available for the Data Bank.
     * @returns {ScraperInfo[]} List of scrapers available for the Data Bank
     */
    static getDataBankScrapers() {
        return ScraperManager.#scrapers.map(s => ({ id: s.id, name: s.name, description: s.description, iconClass: s.iconClass, iconAvailable: s.iconAvailable }));
    }

    /**
     * Run a scraper to scrape data into the Data Bank.
     * @param {string} scraperId ID of the scraper to run
     * @returns {Promise<File[]>|undefined} List of files scraped by the scraper, or undefined if the scraper is not found
     */
    static runDataBankScraper(scraperId) {
        const scraper = ScraperManager.#scrapers.find(s => s.id === scraperId);

        if (!scraper) {
            console.warn(`Scraper with ID ${scraperId} not found`);
            return;
        }

        return scraper.scrape();
    }

    /**
     * Check if a scraper is available.
     * @param {string} scraperId ID of the scraper to check
     * @returns {Promise<boolean>|undefined} Whether the scraper is available, or undefined if the scraper is not found
     */
    static isScraperAvailable(scraperId) {
        const scraper = ScraperManager.#scrapers.find(s => s.id === scraperId);

        if (!scraper) {
            console.warn(`Scraper with ID ${scraperId} not found`);
            return;
        }

        return scraper.isAvailable();
    }
}
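/*
 * Typical usage (illustrative sketch; `clipboardScraper` is the hypothetical
 * object from the example above, not part of this module):
 *
 *   await ScraperManager.registerDataBankScraper(clipboardScraper);
 *   const infos = ScraperManager.getDataBankScrapers();
 *   if (await ScraperManager.isScraperAvailable('clipboard')) {
 *       const files = await ScraperManager.runDataBankScraper('clipboard');
 *   }
 */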
/**
 * Create a text file from a string.
 * @implements {Scraper}
 */
class Notepad {
    constructor() {
        this.id = 'text';
        this.name = 'Notepad';
        this.description = 'Create a text file from scratch.';
        this.iconClass = 'fa-solid fa-note-sticky';
        this.iconAvailable = true;
    }

    /**
     * Check if the scraper is available.
     * @returns {Promise<boolean>}
     */
    async isAvailable() {
        return true;
    }

    /**
     * Create a text file from a string.
     * @returns {Promise<File[]|undefined>} File attachments created from the text, or undefined if cancelled
     */
    async scrape() {
        const template = $(await renderExtensionTemplateAsync('attachments', 'notepad', {}));

        let fileName = `Untitled - ${new Date().toLocaleString()}`;
        let text = '';
        template.find('input[name="notepadFileName"]').val(fileName).on('input', function () {
            fileName = String($(this).val()).trim();
        });
        template.find('textarea[name="notepadFileContent"]').on('input', function () {
            text = String($(this).val());
        });

        const result = await callGenericPopup(template, POPUP_TYPE.CONFIRM, '', { wide: true, large: true, okButton: 'Save', cancelButton: 'Cancel' });

        if (!result || text === '') {
            return;
        }

        const file = new File([text], `Notepad - ${fileName}.txt`, { type: 'text/plain' });
        return [file];
    }
}
/**
 * Scrape data from a webpage.
 * @implements {Scraper}
 */
class WebScraper {
    constructor() {
        this.id = 'web';
        this.name = 'Web';
        this.description = 'Download a page from the web.';
        this.iconClass = 'fa-solid fa-globe';
        this.iconAvailable = true;
    }

    /**
     * Check if the scraper is available.
     * @returns {Promise<boolean>}
     */
    async isAvailable() {
        return true;
    }

    /**
     * Parse the title of an HTML file from a Blob.
     * @param {Blob} blob Blob of the HTML file
     * @returns {Promise<string>} Title of the HTML file
     */
    async getTitleFromHtmlBlob(blob) {
        const text = await blob.text();
        const titleMatch = text.match(/<title>(.*?)<\/title>/i);
        return titleMatch ? titleMatch[1] : '';
    }
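    /*
     * For example (illustrative only):
     *
     *   const blob = new Blob(['<html><head><title>Hello</title></head></html>'], { type: 'text/html' });
     *   await new WebScraper().getTitleFromHtmlBlob(blob); // => 'Hello'
     */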
    /**
     * Scrape file attachments from a webpage.
     * @returns {Promise<File[]|undefined>} File attachments scraped from the webpage, or undefined if cancelled
     */
    async scrape() {
        const template = $(await renderExtensionTemplateAsync('attachments', 'web-scrape', {}));
        const linksString = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel', rows: 4 });

        if (!linksString) {
            return;
        }

        const links = String(linksString).split('\n').map(l => l.trim()).filter(l => l).filter(l => isValidUrl(l));

        if (links.length === 0) {
            toastr.error('Invalid URL');
            return;
        }

        const toast = toastr.info('Working, please wait...');

        const files = [];

        for (const link of links) {
            const result = await fetch('/api/search/visit', {
                method: 'POST',
                headers: getRequestHeaders(),
                body: JSON.stringify({ url: link }),
            });

            const blob = await result.blob();
            const domain = new URL(link).hostname;
            const timestamp = Date.now();
            const title = await this.getTitleFromHtmlBlob(blob) || 'webpage';
            const file = new File([blob], `${title} - ${domain} - ${timestamp}.html`, { type: 'text/html' });
            files.push(file);
        }

        toastr.clear(toast);
        return files;
    }
}
/**
 * Scrape data from a file selection.
 * @implements {Scraper}
 */
class FileScraper {
    constructor() {
        this.id = 'file';
        this.name = 'File';
        this.description = 'Upload a file from your computer.';
        this.iconClass = 'fa-solid fa-upload';
        this.iconAvailable = true;
    }

    /**
     * Check if the scraper is available.
     * @returns {Promise<boolean>}
     */
    async isAvailable() {
        return true;
    }

    /**
     * Scrape file attachments from a file selection dialog.
     * @returns {Promise<File[]>} Files chosen by the user
     */
    async scrape() {
        return new Promise(resolve => {
            // Open the native file picker via a detached input element.
            // Note: if the user dismisses the dialog, no change event fires
            // and this promise never settles.
            const fileInput = document.createElement('input');
            fileInput.type = 'file';
            fileInput.accept = '*/*';
            fileInput.multiple = true;
            fileInput.onchange = () => resolve(Array.from(fileInput.files));
            fileInput.click();
        });
    }
}
/**
 * Scrape data from a MediaWiki wiki.
 * @implements {Scraper}
 */
class MediaWikiScraper {
    constructor() {
        this.id = 'mediawiki';
        this.name = 'MediaWiki';
        this.description = 'Download a page from a MediaWiki wiki.';
        this.iconClass = 'fa-brands fa-wikipedia-w';
        this.iconAvailable = true;
    }

    /**
     * Check if the scraper is available.
     * @returns {Promise<boolean>}
     */
    async isAvailable() {
        try {
            const result = await fetch('/api/plugins/fandom/probe-mediawiki', {
                method: 'POST',
                headers: getRequestHeaders(),
            });

            return result.ok;
        } catch (error) {
            console.debug('Could not probe Fandom/MediaWiki plugin', error);
            return false;
        }
    }

    /**
     * Scrape file attachments from a MediaWiki wiki.
     * @returns {Promise<File[]|undefined>} File attachments scraped from the wiki, or undefined if the user cancels
     */
    async scrape() {
        let url = '';
        let filter = '';
        let output = 'single';

        const template = $(await renderExtensionTemplateAsync('attachments', 'mediawiki-scrape', {}));
        template.find('input[name="scrapeInput"]').on('input', function () {
            url = String($(this).val()).trim();
        });
        template.find('input[name="scrapeFilter"]').on('input', function () {
            filter = String($(this).val());
        });
        template.find('input[name="scrapeOutput"]').on('input', function () {
            output = String($(this).val());
        });

        const confirm = await callGenericPopup(template, POPUP_TYPE.CONFIRM, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel' });

        if (confirm !== POPUP_RESULT.AFFIRMATIVE) {
            return;
        }

        if (!url) {
            toastr.error('URL is required');
            return;
        }

        const toast = toastr.info('Working, please wait...');

        const result = await fetch('/api/plugins/fandom/scrape-mediawiki', {
            method: 'POST',
            headers: getRequestHeaders(),
            body: JSON.stringify({ url, filter }),
        });

        if (!result.ok) {
            // Clear the progress toast before surfacing the error.
            toastr.clear(toast);
            const error = await result.text();
            throw new Error(error);
        }

        const data = await result.json();
        toastr.clear(toast);

        if (output === 'multi') {
            const files = [];

            for (const attachment of data) {
                const file = new File([String(attachment.content).trim()], `${String(attachment.title).trim()}.txt`, { type: 'text/plain' });
                files.push(file);
            }

            return files;
        }

        if (output === 'single') {
            const combinedContent = data.map((a) => String(a.title).trim() + '\n\n' + String(a.content).trim()).join('\n\n\n\n');
            const file = new File([combinedContent], `${url}.txt`, { type: 'text/plain' });
            return [file];
        }

        return [];
    }
}
/**
 * Scrape data from a Fandom wiki.
 * @implements {Scraper}
 */
class FandomScraper {
    constructor() {
        this.id = 'fandom';
        this.name = 'Fandom';
        this.description = 'Download a page from a Fandom wiki.';
        this.iconClass = 'fa-solid fa-fire';
        this.iconAvailable = true;
    }

    /**
     * Check if the scraper is available.
     * @returns {Promise<boolean>}
     */
    async isAvailable() {
        try {
            const result = await fetch('/api/plugins/fandom/probe', {
                method: 'POST',
                headers: getRequestHeaders(),
            });

            return result.ok;
        } catch (error) {
            console.debug('Could not probe Fandom plugin', error);
            return false;
        }
    }

    /**
     * Get the ID of a fandom from a URL or name.
     * @param {string} fandom URL or name of the fandom
     * @returns {string} ID of the fandom
     */
    getFandomId(fandom) {
        try {
            const url = new URL(fandom);
            return url.hostname.split('.')[0] || fandom;
        } catch {
            return fandom;
        }
    }
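    /*
     * For example (illustrative only):
     *
     *   const scraper = new FandomScraper();
     *   scraper.getFandomId('https://starwars.fandom.com/wiki/Main_Page'); // => 'starwars'
     *   scraper.getFandomId('starwars'); // not a URL, returned as-is => 'starwars'
     */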
    /**
     * Scrape file attachments from a Fandom wiki.
     * @returns {Promise<File[]|undefined>} File attachments scraped from the wiki, or undefined if the user cancels
     */
    async scrape() {
        let fandom = '';
        let filter = '';
        let output = 'single';

        const template = $(await renderExtensionTemplateAsync('attachments', 'fandom-scrape', {}));
        template.find('input[name="fandomScrapeInput"]').on('input', function () {
            fandom = String($(this).val()).trim();
        });
        template.find('input[name="fandomScrapeFilter"]').on('input', function () {
            filter = String($(this).val());
        });
        template.find('input[name="fandomScrapeOutput"]').on('input', function () {
            output = String($(this).val());
        });

        const confirm = await callGenericPopup(template, POPUP_TYPE.CONFIRM, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel' });

        if (confirm !== POPUP_RESULT.AFFIRMATIVE) {
            return;
        }

        if (!fandom) {
            toastr.error('Fandom name is required');
            return;
        }

        const toast = toastr.info('Working, please wait...');

        const result = await fetch('/api/plugins/fandom/scrape', {
            method: 'POST',
            headers: getRequestHeaders(),
            body: JSON.stringify({ fandom, filter }),
        });

        if (!result.ok) {
            // Clear the progress toast before surfacing the error.
            toastr.clear(toast);
            const error = await result.text();
            throw new Error(error);
        }

        const data = await result.json();
        toastr.clear(toast);

        if (output === 'multi') {
            const files = [];

            for (const attachment of data) {
                const file = new File([String(attachment.content).trim()], `${String(attachment.title).trim()}.txt`, { type: 'text/plain' });
                files.push(file);
            }

            return files;
        }

        if (output === 'single') {
            const combinedContent = data.map((a) => String(a.title).trim() + '\n\n' + String(a.content).trim()).join('\n\n\n\n');
            const file = new File([combinedContent], `${fandom}.txt`, { type: 'text/plain' });
            return [file];
        }

        return [];
    }
}
/** ISO 639-1 two-letter language codes, offered as suggestions for the "lang" argument of /yt-script. */
const iso6391Codes = [
    'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az',
    'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca', 'ce',
    'ch', 'co', 'cr', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv', 'dz', 'ee',
    'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr',
    'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi', 'ho', 'hr',
    'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik', 'io', 'is',
    'it', 'iu', 'ja', 'jv', 'ka', 'kg', 'ki', 'kj', 'kk', 'kl', 'km', 'kn',
    'ko', 'kr', 'ks', 'ku', 'kv', 'kw', 'ky', 'la', 'lb', 'lg', 'li', 'ln',
    'lo', 'lt', 'lu', 'lv', 'mg', 'mh', 'mi', 'mk', 'ml', 'mn', 'mr', 'ms',
    'mt', 'my', 'na', 'nb', 'nd', 'ne', 'ng', 'nl', 'nn', 'no', 'nr', 'nv',
    'ny', 'oc', 'oj', 'om', 'or', 'os', 'pa', 'pi', 'pl', 'ps', 'pt', 'qu',
    'rm', 'rn', 'ro', 'ru', 'rw', 'sa', 'sc', 'sd', 'se', 'sg', 'si', 'sk',
    'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'ss', 'st', 'su', 'sv', 'sw', 'ta',
    'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tr', 'ts', 'tt', 'tw',
    'ty', 'ug', 'uk', 'ur', 'uz', 've', 'vi', 'vo', 'wa', 'wo', 'xh', 'yi',
    'yo', 'za', 'zh', 'zu'];
/**
 * Scrape transcript from a YouTube video.
 * @implements {Scraper}
 */
class YouTubeScraper {
    constructor() {
        this.id = 'youtube';
        this.name = 'YouTube';
        this.description = 'Download a transcript from a YouTube video.';
        this.iconClass = 'fa-brands fa-youtube';
        this.iconAvailable = true;
    }

    /**
     * Register the /yt-script slash command.
     * @returns {Promise<void>}
     */
    async init() {
        SlashCommandParser.addCommandObject(SlashCommand.fromProps({
            name: 'yt-script',
            callback: async (args, url) => {
                try {
                    if (!url) {
                        throw new Error('URL or ID of the YouTube video is required');
                    }

                    const lang = String(args?.lang || '');
                    const { transcript } = await this.getScript(String(url).trim(), lang);
                    return transcript;
                } catch (error) {
                    toastr.error(error.message);
                    return '';
                }
            },
            helpString: 'Scrape a transcript from a YouTube video by ID or URL.',
            returns: ARGUMENT_TYPE.STRING,
            namedArgumentList: [
                new SlashCommandNamedArgument('lang', 'ISO 639-1 language code of the transcript, e.g. "en"', ARGUMENT_TYPE.STRING, false, false, '', iso6391Codes),
            ],
            unnamedArgumentList: [
                new SlashCommandArgument('URL or ID of the YouTube video', ARGUMENT_TYPE.STRING, true, false),
            ],
        }));
    }

    /**
     * Check if the scraper is available.
     * @returns {Promise<boolean>}
     */
    async isAvailable() {
        return true;
    }

    /**
     * Parse the ID of a YouTube video from a URL.
     * @param {string} url URL of the YouTube video
     * @returns {string} ID of the YouTube video
     */
    parseId(url) {
        // If the URL is already an ID, return it
        if (/^[a-zA-Z0-9_-]{11}$/.test(url)) {
            return url;
        }

        const regex = /^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/|shorts\/)|(?:(?:watch)?\?v(?:i)?=|&v(?:i)?=))([^#&?]*).*/;
        const match = url.match(regex);
        return (match?.length && match[1] ? match[1] : url);
    }
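    /*
     * For example (illustrative only; 'dQw4w9WgXcQ' is an arbitrary video ID):
     *
     *   const scraper = new YouTubeScraper();
     *   scraper.parseId('https://www.youtube.com/watch?v=dQw4w9WgXcQ'); // => 'dQw4w9WgXcQ'
     *   scraper.parseId('https://youtu.be/dQw4w9WgXcQ');                // => 'dQw4w9WgXcQ'
     *   scraper.parseId('dQw4w9WgXcQ');                                 // already an ID => 'dQw4w9WgXcQ'
     */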
    /**
     * Scrape transcript from a YouTube video.
     * @returns {Promise<File[]|undefined>} File attachments scraped from the YouTube video, or undefined if cancelled
     */
    async scrape() {
        let lang = '';
        const template = $(await renderExtensionTemplateAsync('attachments', 'youtube-scrape', {}));
        // Attach the listener before showing the popup; otherwise the language
        // input would never be captured.
        template.find('input[name="youtubeLanguageCode"]').on('input', function () {
            lang = String($(this).val()).trim();
        });
        const videoUrl = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel', rows: 2 });

        if (!videoUrl) {
            return;
        }

        const toast = toastr.info('Working, please wait...');
        try {
            const { transcript, id } = await this.getScript(String(videoUrl), lang);
            const file = new File([transcript], `YouTube - ${id} - ${Date.now()}.txt`, { type: 'text/plain' });
            return [file];
        } finally {
            // Clear the progress toast even if getScript throws.
            toastr.clear(toast);
        }
    }

    /**
     * Fetches the transcript of a YouTube video.
     * @param {string} videoUrl Video URL or ID
     * @param {string} lang Video language
     * @returns {Promise<{ transcript: string, id: string }>} Transcript of the YouTube video with the video ID
     */
    async getScript(videoUrl, lang) {
        const id = this.parseId(String(videoUrl).trim());
        const result = await fetch('/api/search/transcript', {
            method: 'POST',
            headers: getRequestHeaders(),
            body: JSON.stringify({ id, lang }),
        });

        if (!result.ok) {
            const error = await result.text();
            throw new Error(error);
        }

        const transcript = await result.text();
        return { transcript, id };
    }
}
export async function initScrapers() {
    await ScraperManager.registerDataBankScraper(new FileScraper());
    await ScraperManager.registerDataBankScraper(new Notepad());
    await ScraperManager.registerDataBankScraper(new WebScraper());
    await ScraperManager.registerDataBankScraper(new MediaWikiScraper());
    await ScraperManager.registerDataBankScraper(new FandomScraper());
    await ScraperManager.registerDataBankScraper(new YouTubeScraper());
}
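/*
 * Typical usage (illustrative sketch; the call site and module path are
 * assumed, not part of this file): the host application awaits initScrapers()
 * once at startup, then queries the manager when rendering the Data Bank UI.
 *
 *   import { initScrapers, ScraperManager } from './scrapers.js';
 *
 *   await initScrapers();
 *   for (const info of ScraperManager.getDataBankScrapers()) {
 *       console.log(`${info.name}: ${info.description}`);
 *   }
 */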