| 
									
										
										
										
											2021-10-02 17:57:54 +02:00
/**
 * Some useful utility functions around the Wikipedia API
 */
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  | import { Utils } from "../../Utils" | 
					
						
							|  |  |  | import { UIEventSource } from "../UIEventSource" | 
					
						
							|  |  |  | import { WikipediaBoxOptions } from "../../UI/Wikipedia/WikipediaBox" | 
					
						
							| 
									
										
										
										
											2021-10-02 17:57:54 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | export default class Wikipedia { | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * When getting a wikipedia page data result, some elements (e.g. navigation, infoboxes, ...) should be removed if 'removeInfoBoxes' is set. | 
					
						
							|  |  |  |      * We do this based on the classes. This set contains a blacklist of the classes to remove | 
					
						
							|  |  |  |      * @private | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     private static readonly classesToRemove = [ | 
					
						
							|  |  |  |         "shortdescription", | 
					
						
							|  |  |  |         "sidebar", | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         "infobox", | 
					
						
							|  |  |  |         "infobox_v2", | 
					
						
							| 
									
										
										
										
											2021-10-02 22:31:16 +02:00
										 |  |  |         "noprint", | 
					
						
							|  |  |  |         "ambox", | 
					
						
							| 
									
										
										
										
											2021-10-02 17:57:54 +02:00
										 |  |  |         "mw-editsection", | 
					
						
							| 
									
										
										
										
											2021-10-02 22:31:16 +02:00
										 |  |  |         "mw-selflink", | 
					
						
							| 
									
										
										
										
											2021-10-18 20:40:24 +02:00
										 |  |  |         "mw-empty-elt", | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         "hatnote", // Often redirects
 | 
					
						
							| 
									
										
										
										
											2021-10-07 22:06:47 +02:00
										 |  |  |     ] | 
					
						
							| 
									
										
										
										
											2021-10-02 17:57:54 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     private static readonly idsToRemove = ["sjabloon_zie"] | 
					
						
							| 
									
										
										
										
											2021-11-07 16:34:51 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     private static readonly _cache = new Map< | 
					
						
							|  |  |  |         string, | 
					
						
							|  |  |  |         UIEventSource<{ success: string } | { error: any }> | 
					
						
							|  |  |  |     >() | 
					
						
							| 
									
										
										
										
											2021-11-07 16:34:51 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     public readonly backend: string | 
					
						
							| 
									
										
										
										
											2022-04-30 00:30:15 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     constructor(options?: { language?: "en" | string } | { backend?: string }) { | 
					
						
							|  |  |  |         this.backend = Wikipedia.getBackendUrl(options ?? {}) | 
					
						
							| 
									
										
										
										
											2022-04-30 00:30:15 +02:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2022-05-01 20:56:16 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * Tries to extract the language and article name from the given string | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |      * | 
					
						
							| 
									
										
										
										
											2022-05-01 20:56:16 +02:00
										 |  |  |      * Wikipedia.extractLanguageAndName("qsdf") // => undefined
 | 
					
						
							| 
									
										
										
										
											2022-05-01 21:05:58 +02:00
										 |  |  |      * Wikipedia.extractLanguageAndName("nl:Warandeputten") // => {language: "nl", pageName: "Warandeputten"}
 | 
					
						
							| 
									
										
										
										
											2022-05-01 20:56:16 +02:00
										 |  |  |      */ | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     public static extractLanguageAndName(input: string): { language: string; pageName: string } { | 
					
						
							| 
									
										
										
										
											2022-05-01 20:56:16 +02:00
										 |  |  |         const matched = input.match("([^:]+):(.*)") | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |         if (matched === undefined || matched === null) { | 
					
						
							| 
									
										
										
										
											2022-05-01 20:56:16 +02:00
										 |  |  |             return undefined | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |         const [_, language, pageName] = matched | 
					
						
							| 
									
										
										
										
											2022-05-01 20:56:16 +02:00
										 |  |  |         return { | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |             language, | 
					
						
							|  |  |  |             pageName, | 
					
						
							| 
									
										
										
										
											2022-05-01 20:56:16 +02:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2021-10-02 17:57:54 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |     /** | 
					
						
							|  |  |  |      * Extracts the actual pagename; returns undefined if this came from a different wikimedia entry | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |      * | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |      * new Wikipedia({backend: "https://wiki.openstreetmap.org"}).extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => "NL:Speelbos"
 | 
					
						
							|  |  |  |      * new Wikipedia().extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => undefined
 | 
					
						
							|  |  |  |      */ | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     public extractPageName(input: string): string | undefined { | 
					
						
							|  |  |  |         if (!input.startsWith(this.backend)) { | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |             return undefined | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         input = input.substring(this.backend.length) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         const matched = input.match("/?wiki/(.+)") | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |         if (matched === undefined || matched === null) { | 
					
						
							|  |  |  |             return undefined | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         const [_, pageName] = matched | 
					
						
							|  |  |  |         return pageName | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     private static getBackendUrl( | 
					
						
							|  |  |  |         options: { language?: "en" | string } | { backend?: "en.wikipedia.org" | string } | 
					
						
							|  |  |  |     ): string { | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |         let backend = "en.wikipedia.org" | 
					
						
							|  |  |  |         if (options["backend"]) { | 
					
						
							|  |  |  |             backend = options["backend"] | 
					
						
							|  |  |  |         } else if (options["language"]) { | 
					
						
							|  |  |  |             backend = `${options["language"] ?? "en"}.wikipedia.org` | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if (!backend.startsWith("http")) { | 
					
						
							|  |  |  |             backend = "https://" + backend | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         return backend | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     public GetArticle( | 
					
						
							|  |  |  |         pageName: string, | 
					
						
							|  |  |  |         options: WikipediaBoxOptions | 
					
						
							|  |  |  |     ): UIEventSource<{ success: string } | { error: any }> { | 
					
						
							| 
									
										
										
										
											2022-05-27 05:49:21 +02:00
										 |  |  |         const key = this.backend + ":" + pageName + ":" + (options.firstParagraphOnly ?? false) | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |         const cached = Wikipedia._cache.get(key) | 
					
						
							|  |  |  |         if (cached !== undefined) { | 
					
						
							|  |  |  |             return cached | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         const v = UIEventSource.FromPromiseWithErr(this.GetArticleAsync(pageName, options)) | 
					
						
							|  |  |  |         Wikipedia._cache.set(key, v) | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         return v | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     public getDataUrl(pageName: string): string { | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         return ( | 
					
						
							|  |  |  |             `${this.backend}/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + pageName | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     public getPageUrl(pageName: string): string { | 
					
						
							| 
									
										
										
										
											2022-05-27 05:49:21 +02:00
										 |  |  |         return `${this.backend}/wiki/${pageName}` | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * Textual search of the specified wiki-instance. If searching Wikipedia, we recommend using wikidata.search instead | 
					
						
							|  |  |  |      * @param searchTerm | 
					
						
							|  |  |  |      */ | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     public async search(searchTerm: string): Promise<{ title: string; snippet: string }[]> { | 
					
						
							|  |  |  |         const url = | 
					
						
							|  |  |  |             this.backend + | 
					
						
							|  |  |  |             "/w/api.php?action=query&format=json&list=search&srsearch=" + | 
					
						
							|  |  |  |             encodeURIComponent(searchTerm) | 
					
						
							|  |  |  |         return (await Utils.downloadJson(url))["query"]["search"] | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * Searches via 'index.php' and scrapes the result. | 
					
						
							|  |  |  |      * This gives better results then via the API | 
					
						
							|  |  |  |      * @param searchTerm | 
					
						
							|  |  |  |      */ | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     public async searchViaIndex( | 
					
						
							|  |  |  |         searchTerm: string | 
					
						
							|  |  |  |     ): Promise<{ title: string; snippet: string; url: string }[]> { | 
					
						
							| 
									
										
										
										
											2022-06-08 03:35:11 +02:00
										 |  |  |         const url = `${this.backend}/w/index.php?search=${encodeURIComponent(searchTerm)}&ns0=1` | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         const result = await Utils.downloadAdvanced(url) | 
					
						
							|  |  |  |         if (result["redirect"]) { | 
					
						
							| 
									
										
										
										
											2022-06-08 03:35:11 +02:00
										 |  |  |             const targetUrl = result["redirect"] | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |             // This is an exact match
 | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |             return [ | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     title: this.extractPageName(targetUrl)?.trim(), | 
					
						
							|  |  |  |                     url: targetUrl, | 
					
						
							|  |  |  |                     snippet: "", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |             ] | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2022-12-16 13:45:07 +01:00
										 |  |  |         if (result["error"]) { | 
					
						
							|  |  |  |             throw "Could not download: " + JSON.stringify(result) | 
					
						
							| 
									
										
										
										
											2022-12-16 01:02:23 +01:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         const el = document.createElement("html") | 
					
						
							|  |  |  |         el.innerHTML = result["content"].replace(/href="\//g, 'href="' + this.backend + "/") | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |         const searchResults = el.getElementsByClassName("mw-search-results") | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         const individualResults = Array.from( | 
					
						
							|  |  |  |             searchResults[0]?.getElementsByClassName("mw-search-result") ?? [] | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         return individualResults.map((result) => { | 
					
						
							| 
									
										
										
										
											2022-05-27 05:49:21 +02:00
										 |  |  |             const toRemove = Array.from(result.getElementsByClassName("searchalttitle")) | 
					
						
							|  |  |  |             for (const toRm of toRemove) { | 
					
						
							|  |  |  |                 toRm.parentElement.removeChild(toRm) | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |             return { | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |                 title: result | 
					
						
							|  |  |  |                     .getElementsByClassName("mw-search-result-heading")[0] | 
					
						
							|  |  |  |                     .textContent.trim(), | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |                 url: result.getElementsByTagName("a")[0].href, | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |                 snippet: result.getElementsByClassName("searchresult")[0].textContent, | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |             } | 
					
						
							|  |  |  |         }) | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |     public async GetArticleAsync( | 
					
						
							|  |  |  |         pageName: string, | 
					
						
							|  |  |  |         options: { | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |             firstParagraphOnly?: false | boolean | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         } | 
					
						
							|  |  |  |     ): Promise<string | undefined> { | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |         const response = await Utils.downloadJson(this.getDataUrl(pageName)) | 
					
						
							|  |  |  |         if (response?.parse?.text === undefined) { | 
					
						
							|  |  |  |             return undefined | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         const html = response["parse"]["text"]["*"] | 
					
						
							| 
									
										
										
										
											2022-05-26 13:23:25 +02:00
										 |  |  |         if (html === undefined) { | 
					
						
							|  |  |  |             return undefined | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2021-10-02 17:57:54 +02:00
										 |  |  |         const div = document.createElement("div") | 
					
						
							|  |  |  |         div.innerHTML = html | 
					
						
							|  |  |  |         const content = Array.from(div.children)[0] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for (const forbiddenClass of Wikipedia.classesToRemove) { | 
					
						
							| 
									
										
										
										
											2021-11-07 16:34:51 +01:00
										 |  |  |             const toRemove = content.getElementsByClassName(forbiddenClass) | 
					
						
							| 
									
										
										
										
											2021-10-02 17:57:54 +02:00
										 |  |  |             for (const toRemoveElement of Array.from(toRemove)) { | 
					
						
							|  |  |  |                 toRemoveElement.parentElement?.removeChild(toRemoveElement) | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2021-10-02 22:31:16 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-07 22:06:47 +02:00
										 |  |  |         for (const forbiddenId of Wikipedia.idsToRemove) { | 
					
						
							| 
									
										
										
										
											2021-11-07 16:34:51 +01:00
										 |  |  |             const toRemove = content.querySelector("#" + forbiddenId) | 
					
						
							| 
									
										
										
										
											2021-10-07 22:06:47 +02:00
										 |  |  |             toRemove?.parentElement?.removeChild(toRemove) | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2021-11-07 16:34:51 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-02 22:31:16 +02:00
										 |  |  |         const links = Array.from(content.getElementsByTagName("a")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Rewrite relative links to absolute links + open them in a new tab
 | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  |         links | 
					
						
							|  |  |  |             .filter((link) => link.getAttribute("href")?.startsWith("/") ?? false) | 
					
						
							|  |  |  |             .forEach((link) => { | 
					
						
							|  |  |  |                 link.target = "_blank" | 
					
						
							|  |  |  |                 // note: link.getAttribute("href") gets the textual value, link.href is the rewritten version which'll contain the host for relative paths
 | 
					
						
							|  |  |  |                 link.href = `${this.backend}${link.getAttribute("href")}` | 
					
						
							|  |  |  |             }) | 
					
						
							| 
									
										
										
										
											2021-10-02 22:31:16 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-30 00:30:15 +02:00
										 |  |  |         if (options?.firstParagraphOnly) { | 
					
						
							|  |  |  |             return content.getElementsByTagName("p").item(0).innerHTML | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-02 22:31:16 +02:00
										 |  |  |         return content.innerHTML | 
					
						
							| 
									
										
										
										
											2021-10-02 17:57:54 +02:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2022-09-08 21:40:48 +02:00
										 |  |  | } |