/**
 * Some useful utility functions around the wikipedia API
 */
import { Utils } from "../../Utils"
import { UIEventSource } from "../UIEventSource"
import { WikipediaBoxOptions } from "../../UI/Wikipedia/WikipediaBox"

export default class Wikipedia {
    /**
     * When getting a wikipedia page data result, some elements (e.g. navigation, infoboxes, ...) should be removed if 'removeInfoBoxes' is set.
     * We do this based on their CSS classes; this list is the blacklist of classes to remove.
     * @private
     */
    private static readonly classesToRemove = [
        "shortdescription",
        "sidebar",
        "infobox",
        "infobox_v2",
        "noprint",
        "ambox",
        "mw-editsection",
        "mw-selflink",
        "mw-empty-elt",
        "hatnote", // Often redirects
    ]

    private static readonly idsToRemove = ["sjabloon_zie"]

    private static readonly _cache = new Map<
        string,
        UIEventSource<{ success: string } | { error: any }>
    >()

    public readonly backend: string

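    /**
     * Constructs a wikipedia client for the requested backend; defaults to English Wikipedia.
     *
     * new Wikipedia().backend // => "https://en.wikipedia.org"
     * new Wikipedia({language: "nl"}).backend // => "https://nl.wikipedia.org"
     * new Wikipedia({backend: "https://wiki.openstreetmap.org"}).backend // => "https://wiki.openstreetmap.org"
     */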
    constructor(options?: { language?: "en" | string } | { backend?: string }) {
        this.backend = Wikipedia.getBackendUrl(options ?? {})
    }

    /**
     * Tries to extract the language and article name from the given string
     *
     * Wikipedia.extractLanguageAndName("qsdf") // => undefined
     * Wikipedia.extractLanguageAndName("nl:Warandeputten") // => {language: "nl", pageName: "Warandeputten"}
     */
    public static extractLanguageAndName(
        input: string
    ): { language: string; pageName: string } | undefined {
        const matched = input.match("([^:]+):(.*)")
        if (matched === undefined || matched === null) {
            return undefined
        }
        const [_, language, pageName] = matched
        return {
            language,
            pageName,
        }
    }

    /**
     * Extracts the actual page name; returns undefined if the given url does not belong to this wiki's backend
     *
     * new Wikipedia({backend: "https://wiki.openstreetmap.org"}).extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => "NL:Speelbos"
     * new Wikipedia().extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => undefined
     */
    public extractPageName(input: string): string | undefined {
        if (!input.startsWith(this.backend)) {
            return undefined
        }
        input = input.substring(this.backend.length)

        const matched = input.match("/?wiki/(.+)")
        if (matched === undefined || matched === null) {
            return undefined
        }
        const [_, pageName] = matched
        return pageName
    }

    private static getBackendUrl(
        options: { language?: "en" | string } | { backend?: "en.wikipedia.org" | string }
    ): string {
        let backend = "en.wikipedia.org"
        if (options["backend"]) {
            backend = options["backend"]
        } else if (options["language"]) {
            backend = `${options["language"] ?? "en"}.wikipedia.org`
        }
        if (!backend.startsWith("http")) {
            backend = "https://" + backend
        }
        return backend
    }

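    /**
     * Fetches (and caches) the article behind 'pageName' as a UIEventSource.
     * Subsequent calls for the same page with the same 'firstParagraphOnly'-setting reuse the cached
     * event source instead of triggering a new download.
     */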
    public GetArticle(
        pageName: string,
        options: WikipediaBoxOptions
    ): UIEventSource<{ success: string } | { error: any }> {
        const key = this.backend + ":" + pageName + ":" + (options.firstParagraphOnly ?? false)
        const cached = Wikipedia._cache.get(key)
        if (cached !== undefined) {
            return cached
        }
        const v = UIEventSource.FromPromiseWithErr(this.GetArticleAsync(pageName, options))
        Wikipedia._cache.set(key, v)
        return v
    }

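    /**
     * Builds the api-url which returns the parsed article text as JSON.
     *
     * new Wikipedia().getDataUrl("Amsterdam") // => "https://en.wikipedia.org/w/api.php?action=parse&format=json&origin=*&prop=text&page=Amsterdam"
     */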
    public getDataUrl(pageName: string): string {
        return (
            `${this.backend}/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + pageName
        )
    }

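    /**
     * Builds the url of the human-readable wiki page.
     *
     * new Wikipedia({language: "nl"}).getPageUrl("Warandeputten") // => "https://nl.wikipedia.org/wiki/Warandeputten"
     */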
    public getPageUrl(pageName: string): string {
        return `${this.backend}/wiki/${pageName}`
    }

    /**
     * Textual search of the specified wiki-instance. If searching Wikipedia, we recommend using wikidata.search instead
     * @param searchTerm
     */
    public async search(searchTerm: string): Promise<{ title: string; snippet: string }[]> {
        const url =
            this.backend +
            "/w/api.php?action=query&format=json&list=search&srsearch=" +
            encodeURIComponent(searchTerm)
        return (await Utils.downloadJson(url))["query"]["search"]
    }

    /**
     * Searches via 'index.php' and scrapes the result.
     * This gives better results than the API
     * @param searchTerm
     */
    public async searchViaIndex(
        searchTerm: string
    ): Promise<{ title: string; snippet: string; url: string }[]> {
        const url = `${this.backend}/w/index.php?search=${encodeURIComponent(searchTerm)}&ns0=1`
        const result = await Utils.downloadAdvanced(url)
        if (result["redirect"]) {
            const targetUrl = result["redirect"]
            // This is an exact match
            return [
                {
                    title: this.extractPageName(targetUrl)?.trim(),
                    url: targetUrl,
                    snippet: "",
                },
            ]
        }
        if (result["error"]) {
            throw "Could not download: " + JSON.stringify(result)
        }
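        // No redirect, so we got an actual search page: parse it and scrape the individual results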
        const el = document.createElement("html")
 | 
						|
        el.innerHTML = result["content"].replace(/href="\//g, 'href="' + this.backend + "/")
 | 
						|
        const searchResults = el.getElementsByClassName("mw-search-results")
 | 
						|
        const individualResults = Array.from(
 | 
						|
            searchResults[0]?.getElementsByClassName("mw-search-result") ?? []
 | 
						|
        )
 | 
						|
        return individualResults.map((result) => {
 | 
						|
            const toRemove = Array.from(result.getElementsByClassName("searchalttitle"))
 | 
						|
            for (const toRm of toRemove) {
 | 
						|
                toRm.parentElement.removeChild(toRm)
 | 
						|
            }
 | 
						|
 | 
						|
            return {
 | 
						|
                title: result
 | 
						|
                    .getElementsByClassName("mw-search-result-heading")[0]
 | 
						|
                    .textContent.trim(),
 | 
						|
                url: result.getElementsByTagName("a")[0].href,
 | 
						|
                snippet: result.getElementsByClassName("searchresult")[0].textContent,
 | 
						|
            }
 | 
						|
        })
 | 
						|
    }
 | 
						|
 | 
						|
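    /**
     * Downloads the article 'pageName' (as HTML, via the parse-API) and cleans it up:
     * unwanted elements (infoboxes, navigation aids, edit links, ...) are removed and relative links
     * are rewritten into absolute links to the backend which open in a new tab.
     * Returns undefined if the page could not be parsed.
     */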
    public async GetArticleAsync(
        pageName: string,
        options: {
            firstParagraphOnly?: boolean
        }
    ): Promise<string | undefined> {
        const response = await Utils.downloadJson(this.getDataUrl(pageName))
        if (response?.parse?.text === undefined) {
            return undefined
        }
        const html = response["parse"]["text"]["*"]
        if (html === undefined) {
            return undefined
        }
        const div = document.createElement("div")
        div.innerHTML = html
        const content = Array.from(div.children)[0]

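        // Strip elements which should not be shown (infoboxes, navigation aids, edit links, ...),
        // first by their blacklisted classes, then by their blacklisted ids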
        for (const forbiddenClass of Wikipedia.classesToRemove) {
            const toRemove = content.getElementsByClassName(forbiddenClass)
            for (const toRemoveElement of Array.from(toRemove)) {
                toRemoveElement.parentElement?.removeChild(toRemoveElement)
            }
        }

        for (const forbiddenId of Wikipedia.idsToRemove) {
            const toRemove = content.querySelector("#" + forbiddenId)
            toRemove?.parentElement?.removeChild(toRemove)
        }

        const links = Array.from(content.getElementsByTagName("a"))

        // Rewrite relative links to absolute links + open them in a new tab
        links
            .filter((link) => link.getAttribute("href")?.startsWith("/") ?? false)
            .forEach((link) => {
                link.target = "_blank"
                // note: link.getAttribute("href") gets the textual value; link.href is the resolved version, which will contain the current host for relative paths
                link.href = `${this.backend}${link.getAttribute("href")}`
            })

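        // If only a teaser was requested, return just the first paragraph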
        if (options?.firstParagraphOnly) {
            return content.getElementsByTagName("p").item(0)?.innerHTML
        }

        return content.innerHTML
    }
}