Refactoring: move all code files into a src directory

2023-07-09 13:09:05 +02:00 · 2023-07-09 13:09:05 +02:00 · e75d2789d2
commit e75d2789d2
parent de99f56ca8
389 changed files with 0 additions and 12 deletions
--- a/src/Logic/Web/Wikipedia.ts
+++ b/src/Logic/Web/Wikipedia.ts
@ -0,0 +1,298 @@
+import { Utils } from "../../Utils"
+import Wikidata, { WikidataResponse } from "./Wikidata"
+import { Store, UIEventSource } from "../UIEventSource"
+
+/**
+ * Everything which is known about a single wikipedia article (and the matching wikidata entry, if any).
+ * All fields are optional, as they are filled in step by step while the various downloads finish.
+ */
+export interface FullWikipediaDetails {
+    /** Title to show; the wikidata id or the page name as given, replaced by the page title once it is known */
+    title?: string
+    /** Language code of the wikipedia the article lives on, e.g. "nl" */
+    language?: string
+    /** Name of the page within that wikipedia */
+    pagename?: string
+    /** Link to the human readable article */
+    articleUrl?: string
+    /** The wikidata entry, only set if the lookup started from a wikidata id */
+    wikidata?: WikidataResponse
+
+    /** HTML of the entire (cleaned up) article */
+    fullArticle?: string
+    /** Inner HTML of the first `<p>` of the article */
+    firstParagraph?: string
+    /** HTML of the article without that first paragraph */
+    restOfArticle?: string
+}
+
+/**
+ * Small client for a single MediaWiki instance (by default the English Wikipedia).
+ *
+ * An instance is bound to one backend and can build URLs, search and download cleaned-up article HTML.
+ * The static helpers combine this with Wikidata to collect everything that is known about an entity.
+ */
+export default class Wikipedia {
+    /**
+     * When getting a wikipedia page data result, some elements (e.g. navigation, infoboxes, ...) are removed.
+     * We do this based on the classes. This list contains a blacklist of the classes to remove
+     * @private
+     */
+    private static readonly classesToRemove = [
+        "shortdescription",
+        "sidebar",
+        "infobox",
+        "infobox_v2",
+        "noprint",
+        "ambox",
+        "mw-editsection",
+        "mw-selflink",
+        "mw-empty-elt",
+        "hatnote", // Often redirects
+    ]
+
+    /** Elements with one of these ids are removed from a downloaded article as well */
+    private static readonly idsToRemove = ["sjabloon_zie"]
+
+    /** Cleaned article HTML, keyed on `backend + "/" + pageName`. The promise is cached so that concurrent requests share one download */
+    private static readonly _cache = new Map<string, Promise<string | undefined>>()
+    /** Stores handed out by `fetchArticleAndWikidata`, keyed on `preferedLanguage + wikidataOrPageId` */
+    private static readonly _fullDetailsCache = new Map<string, Store<FullWikipediaDetails>>()
+    /** Base URL of the wiki, always including the protocol, e.g. "https://en.wikipedia.org" */
+    public readonly backend: string
+
+    /**
+     * @param options either the language of the wikipedia to use, or the backend (host or full base URL) of another MediaWiki instance.
+     * Without options, the English wikipedia is used.
+     */
+    constructor(options?: { language?: "en" | string } | { backend?: string }) {
+        this.backend = Wikipedia.getBackendUrl(options ?? {})
+    }
+
+    /**
+     * Tries to extract the language and article name from the given string.
+     * The input is split at its first ':'; `undefined` is returned if there is nothing to split.
+     *
+     * NOTE(review): a full URL is split at its first ':' as well (giving "https" as language), so callers should pass 'language:pagename'
+     *
+     * Wikipedia.extractLanguageAndName("qsdf") // => undefined
+     * Wikipedia.extractLanguageAndName("nl:Warandeputten") // => {language: "nl", pageName: "Warandeputten"}
+     */
+    public static extractLanguageAndName(
+        input: string
+    ): { language: string; pageName: string } | undefined {
+        const matched = input.match(/([^:]+):(.*)/)
+        if (matched === undefined || matched === null) {
+            return undefined
+        }
+        const [, language, pageName] = matched
+        return {
+            language,
+            pageName,
+        }
+    }
+
+    /**
+     * Fetch all useful information for the given entity.
+     *
+     * The returned store starts out (nearly) empty and is pinged every time a new piece of information has been loaded.
+     * Stores are cached: asking for the same id and language twice yields the same store.
+     *
+     * @param wikidataOrPageId a wikidata identifier or a wikipedia identifier of the form 'language:pagename'
+     * @param preferedLanguage the language to pick the wikipedia article in if the wikidata entry has a page in that language
+     */
+    public static fetchArticleAndWikidata(
+        wikidataOrPageId: string,
+        preferedLanguage: string
+    ): Store<FullWikipediaDetails> {
+        const cachekey = preferedLanguage + wikidataOrPageId
+        const cached = Wikipedia._fullDetailsCache.get(cachekey)
+        if (cached) {
+            return cached
+        }
+        console.log("Constructing store for", cachekey)
+        const store = new UIEventSource<FullWikipediaDetails>({}, cachekey)
+        Wikipedia._fullDetailsCache.set(cachekey, store)
+
+        // Are we dealing with a wikidata item?
+        const wikidataId = Wikidata.ExtractKey(wikidataOrPageId)
+        if (!wikidataId) {
+            // We are dealing with a wikipedia identifier, e.g. 'nl:articlename'
+            const parsed = Wikipedia.extractLanguageAndName(wikidataOrPageId)
+            if (parsed === undefined) {
+                // Neither a wikidata id nor 'language:pagename'. There is nothing to download,
+                // so we hand out the empty store instead of crashing (which would leave a broken store in the cache)
+                console.warn("Not a wikidata id nor a wikipedia identifier:", wikidataOrPageId)
+                return store
+            }
+            const { language, pageName } = parsed
+            store.data.articleUrl = new Wikipedia({ language }).getPageUrl(pageName)
+            store.data.language = language
+            store.data.pagename = pageName
+            store.data.title = pageName
+        } else {
+            // Yes, this is a wikidata item
+            // Let's fetch the wikidata
+            store.data.title = wikidataId
+            Wikidata.LoadWikidataEntryAsync(wikidataId)
+                .then((wikidata) => {
+                    store.data.wikidata = wikidata
+                    store.ping()
+                    const wikisites = wikidata?.wikisites
+                    if (!wikisites) {
+                        return
+                    }
+                    // With the wikidata, we can search for the appropriate wikipedia page:
+                    // the requested language first, English second, whatever is available last
+                    const languagesToTry = [
+                        preferedLanguage,
+                        "en",
+                        Array.from(wikisites.keys())[0],
+                    ]
+
+                    for (const language of languagesToTry) {
+                        const pagetitle = wikisites.get(language)
+                        if (pagetitle) {
+                            store.data.articleUrl = new Wikipedia({ language }).getPageUrl(
+                                pagetitle
+                            )
+                            store.data.pagename = pagetitle
+                            store.data.language = language
+                            store.data.title = pagetitle
+                            store.ping()
+                            break
+                        }
+                    }
+                })
+                .catch((e) => console.error("Could not load wikidata entry", wikidataId, e))
+        }
+
+        // Now that the pageURL has been setup, we can focus on downloading the actual article
+        // We setup a listener. As soon as the article-URL is known, we'll fetch the actual page
+        // This url can either be set by the Wikidata-response or directly if we are dealing with a wikipedia identifier
+        store.addCallbackAndRun((data) => {
+            if (data.language === undefined || data.pagename === undefined) {
+                return
+            }
+            const wikipedia = new Wikipedia({ language: data.language })
+            wikipedia
+                .GetArticleHtml(data.pagename)
+                .then((article) => {
+                    if (article === undefined) {
+                        // The page does not exist or has no content
+                        return
+                    }
+                    data.fullArticle = article
+                    const content = document.createElement("div")
+                    content.innerHTML = article
+                    const firstParagraph = content.getElementsByTagName("p").item(0)
+                    if (firstParagraph !== null) {
+                        data.firstParagraph = firstParagraph.innerHTML
+                        // The paragraph might be nested, so it must be detached from its own parent
+                        firstParagraph.remove()
+                    }
+                    data.restOfArticle = content.innerHTML
+                    store.ping()
+                })
+                .catch((e) =>
+                    console.error("Could not load wikipedia article", data.pagename, e)
+                )
+            return true // unregister
+        })
+
+        return store
+    }
+
+    /**
+     * Determines the base URL to use: an explicit backend wins over a language; without either, the English wikipedia is used.
+     * "https://" is prepended if the backend does not start with "http".
+     */
+    private static getBackendUrl(
+        options: { language?: "en" | string } | { backend?: "en.wikipedia.org" | string }
+    ): string {
+        const { backend, language }: { backend?: string; language?: string } = options
+        let url = "en.wikipedia.org"
+        if (backend) {
+            url = backend
+        } else if (language) {
+            url = `${language}.wikipedia.org`
+        }
+        if (!url.startsWith("http")) {
+            url = "https://" + url
+        }
+        return url
+    }
+
+    /**
+     * Extracts the actual pagename; returns undefined if this came from a different wikimedia entry
+     *
+     * new Wikipedia({backend: "https://wiki.openstreetmap.org"}).extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => "NL:Speelbos"
+     * new Wikipedia().extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => undefined
+     * new Wikipedia().extractPageName("https://en.wikipedia.org.evil.example/wiki/Foo") // => undefined
+     */
+    public extractPageName(input: string): string | undefined {
+        if (!input.startsWith(this.backend)) {
+            return undefined
+        }
+        const path = input.substring(this.backend.length)
+
+        // Anchored: the path must start with 'wiki/' directly behind the backend.
+        // Otherwise a host which merely starts with our backend would be accepted too
+        const matched = path.match(/^\/?wiki\/(.+)/)
+        if (matched === undefined || matched === null) {
+            return undefined
+        }
+        return matched[1]
+    }
+
+    /**
+     * The API-url which gives the parsed HTML of the given page as JSON.
+     * The page name is URL-encoded, so names containing e.g. '&' or '#' are passed on intact
+     *
+     * new Wikipedia().getDataUrl("AT&T") // => "https://en.wikipedia.org/w/api.php?action=parse&format=json&origin=*&prop=text&page=AT%26T"
+     */
+    public getDataUrl(pageName: string): string {
+        return (
+            `${this.backend}/w/api.php?action=parse&format=json&origin=*&prop=text&page=` +
+            encodeURIComponent(pageName)
+        )
+    }
+
+    /**
+     * The URL of the human readable page
+     *
+     * new Wikipedia({language: "nl"}).getPageUrl("Warandeputten") // => "https://nl.wikipedia.org/wiki/Warandeputten"
+     */
+    public getPageUrl(pageName: string): string {
+        return `${this.backend}/wiki/${pageName}`
+    }
+
+    /**
+     * Textual search of the specified wiki-instance. If searching Wikipedia, we recommend using wikidata.search instead
+     * @param searchTerm
+     */
+    public async search(searchTerm: string): Promise<{ title: string; snippet: string }[]> {
+        const url =
+            this.backend +
+            "/w/api.php?action=query&format=json&list=search&srsearch=" +
+            encodeURIComponent(searchTerm)
+        return (await Utils.downloadJson(url))["query"]["search"]
+    }
+
+    /**
+     * Searches via 'index.php' and scrapes the result.
+     * This gives better results than via the API
+     *
+     * If the wiki redirects straight to a page, that page is returned as the single (exact) match.
+     * @param searchTerm
+     */
+    public async searchViaIndex(
+        searchTerm: string
+    ): Promise<{ title: string; snippet: string; url: string }[]> {
+        const url = `${this.backend}/w/index.php?search=${encodeURIComponent(searchTerm)}&ns0=1`
+        const result = await Utils.downloadAdvanced(url)
+        if (result["redirect"]) {
+            const targetUrl = result["redirect"]
+            // This is an exact match
+            return [
+                {
+                    title: this.extractPageName(targetUrl)?.trim(),
+                    url: targetUrl,
+                    snippet: "",
+                },
+            ]
+        }
+        if (result["error"]) {
+            throw "Could not download: " + JSON.stringify(result)
+        }
+        const el = document.createElement("html")
+        // Make site-relative links absolute. Protocol-relative links ('//host/...') are left alone
+        el.innerHTML = result["content"].replace(/href="\/(?!\/)/g, 'href="' + this.backend + "/")
+        const searchResults = el.getElementsByClassName("mw-search-results")
+        const individualResults = Array.from(
+            searchResults[0]?.getElementsByClassName("mw-search-result") ?? []
+        )
+        return individualResults.map((searchResult) => {
+            // The alternative titles would pollute the text content of the heading
+            for (const altTitle of Array.from(
+                searchResult.getElementsByClassName("searchalttitle")
+            )) {
+                altTitle.remove()
+            }
+
+            return {
+                title: searchResult
+                    .getElementsByClassName("mw-search-result-heading")[0]
+                    .textContent.trim(),
+                url: searchResult.getElementsByTagName("a")[0].href,
+                snippet: searchResult.getElementsByClassName("searchresult")[0].textContent,
+            }
+        })
+    }
+
+    /**
+     * Returns the innerHTML for the given article as string; resolves to undefined if the wiki returned no content.
+     * Some cleanup is applied to this.
+     *
+     * This method uses a static, local cache, so each article will be retrieved only once via the network.
+     * A failed download is dropped from the cache again, so that a later call can retry.
+     */
+    public GetArticleHtml(pageName: string): Promise<string | undefined> {
+        const cacheKey = this.backend + "/" + pageName
+        const cached = Wikipedia._cache.get(cacheKey)
+        if (cached !== undefined) {
+            return cached
+        }
+        const promise = this.GetArticleUncachedAsync(pageName)
+        Wikipedia._cache.set(cacheKey, promise)
+        promise.catch(() => {
+            if (Wikipedia._cache.get(cacheKey) === promise) {
+                Wikipedia._cache.delete(cacheKey)
+            }
+        })
+        return promise
+    }
+
+    /**
+     * Downloads the article and cleans it up:
+     * - elements with a blacklisted class or id are removed
+     * - site-relative links are made absolute and open in a new tab
+     *
+     * Resolves to undefined if the response contains no (non-empty) HTML.
+     */
+    private async GetArticleUncachedAsync(pageName: string): Promise<string | undefined> {
+        const response = await Utils.downloadJson(this.getDataUrl(pageName))
+        const html = response?.parse?.text?.["*"]
+        if (html === undefined) {
+            return undefined
+        }
+        const div = document.createElement("div")
+        div.innerHTML = html
+        // The actual article is wrapped in a single root element
+        const content = div.children[0]
+        if (content === undefined) {
+            return undefined
+        }
+
+        for (const forbiddenClass of Wikipedia.classesToRemove) {
+            // Copy into an array first: the HTMLCollection is live and shrinks while we remove elements
+            for (const toRemove of Array.from(content.getElementsByClassName(forbiddenClass))) {
+                toRemove.remove()
+            }
+        }
+
+        for (const forbiddenId of Wikipedia.idsToRemove) {
+            content.querySelector("#" + forbiddenId)?.remove()
+        }
+
+        // Rewrite relative links to absolute links + open them in a new tab
+        for (const link of Array.from(content.getElementsByTagName("a"))) {
+            // note: link.getAttribute("href") gets the textual value, link.href is the rewritten version which'll contain the host for relative paths
+            const href = link.getAttribute("href")
+            if (!href?.startsWith("/") || href.startsWith("//")) {
+                // Absent, already absolute or protocol-relative ('//host/...'): prefixing the backend would break the link
+                continue
+            }
+            link.target = "_blank"
+            link.href = `${this.backend}${href}`
+        }
+
+        return content.innerHTML
+    }
+}