import {Utils} from "../../Utils";
import {UIEventSource} from "../UIEventSource";
import * as wds from "wikidata-sdk"

export class WikidataResponse {
    public readonly id: string
    public readonly labels: Map<string, string>
    public readonly descriptions: Map<string, string>
    public readonly claims: Map<string, Set<string>>
    public readonly wikisites: Map<string, string>
    public readonly commons: string

    constructor(
        id: string,
        labels: Map<string, string>,
        descriptions: Map<string, string>,
        claims: Map<string, Set<string>>,
        wikisites: Map<string, string>,
        commons: string
    ) {

        this.id = id
        this.labels = labels
        this.descriptions = descriptions
        this.claims = claims
        this.wikisites = wikisites
        this.commons = commons

    }

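    /**
     * Parses a single entity (as returned by e.g. the Special:EntityData endpoint) into a WikidataResponse
     */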
    public static fromJson(entity: any): WikidataResponse {
        const labels = new Map<string, string>()
        for (const labelName in entity.labels) {
            // The labelname is the language code
            labels.set(labelName, entity.labels[labelName].value)
        }

        const descr = new Map<string, string>()
        for (const labelName in entity.descriptions) {
            // The labelname is the language code
            descr.set(labelName, entity.descriptions[labelName].value)
        }

        const sitelinks = new Map<string, string>();
        for (const labelName in entity.sitelinks) {
            // labelName is `${language}wiki`
            const language = labelName.substring(0, labelName.length - 4)
            const title = entity.sitelinks[labelName].title
            sitelinks.set(language, title)
        }

        const commons = sitelinks.get("commons")
        sitelinks.delete("commons")
        const claims = WikidataResponse.extractClaims(entity.claims);
        return new WikidataResponse(
            entity.id,
            labels,
            descr,
            claims,
            sitelinks,
            commons
        )

    }

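    /**
     * Simplifies the raw 'claims' JSON of an entity into a map from property id to the set of simplified values
     */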
    static extractClaims(claimsJson: any): Map<string, Set<string>> {

        const simplified = wds.simplify.claims(claimsJson, {
            timeConverter: 'simple-day'
        })

        const claims = new Map<string, Set<string>>();
        for (const claimId in simplified) {
            const claimsList: any[] = simplified[claimId]
            claims.set(claimId, new Set(claimsList));
        }
        return claims
    }
}

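/**
 * A thin wrapper around a wikidata lexeme, exposing its lemmas, senses and claims; it can be converted into a regular WikidataResponse
 */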
export class WikidataLexeme {
    id: string
    lemma: Map<string, string>
    senses: Map<string, string>
    claims: Map<string, Set<string>>


    constructor(json) {
        this.id = json.id
        this.claims = WikidataResponse.extractClaims(json.claims)
        this.lemma = new Map<string, string>()
        for (const language in json.lemmas) {
            this.lemma.set(language, json.lemmas[language].value)
        }

        this.senses = new Map<string, string>()

        for (const sense of json.senses) {
            const glosses = sense.glosses
            for (const language in glosses) {
                let previousSenses = this.senses.get(language)
                if (previousSenses === undefined) {
                    previousSenses = ""
                } else {
                    previousSenses = previousSenses + "; "
                }
                this.senses.set(language, previousSenses + (glosses[language].value ?? ""))
            }
        }
    }

    asWikidataResponse() {
        return new WikidataResponse(
            this.id,
            this.lemma,
            this.senses,
            this.claims,
            new Map(),
            undefined
        );
    }
}

export interface WikidataSearchoptions {
    lang?: "en" | string,
    maxCount?: 20 | number
}

export interface WikidataAdvancedSearchoptions extends WikidataSearchoptions {
    instanceOf?: number[];
    notInstanceOf?: number[]
}

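/*
 * Minimal usage sketch for the options above (illustrative only: "Berlin" and Q515 ('city')
 * are made-up example inputs, and the call needs network access to the wikidata endpoints):
 *
 *   const hits = await Wikidata.searchAdvanced("Berlin", {
 *       lang: "en",
 *       maxCount: 5,
 *       instanceOf: [515] // restrict the search to cities
 *   })
 *   console.log(hits.map(h => `${h.id}: ${h.label}`))
 */
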
/**
 * Utility functions around wikidata
 */
export default class Wikidata {

    private static readonly _identifierPrefixes = ["Q", "L"].map(str => str.toLowerCase())
    private static readonly _prefixesToRemove = ["https://www.wikidata.org/wiki/Lexeme:",
        "https://www.wikidata.org/wiki/",
        "http://www.wikidata.org/entity/",
        "Lexeme:"].map(str => str.toLowerCase())


    private static readonly _cache = new Map<string, UIEventSource<{ success: WikidataResponse } | { error: any }>>()

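    /**
     * Loads the wikidata entry for the given id or URL as an event source.
     * Results are cached: repeated calls with the same key return the same UIEventSource.
     */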
    public static LoadWikidataEntry(value: string | number): UIEventSource<{ success: WikidataResponse } | { error: any }> {
        const key = this.ExtractKey(value)
        const cached = Wikidata._cache.get(key)
        if (cached !== undefined) {
            return cached
        }
        const src = UIEventSource.FromPromiseWithErr(Wikidata.LoadWikidataEntryAsync(key))
        Wikidata._cache.set(key, src)
        return src;
    }

    /**
     * Given a search text, searches for the relevant wikidata entries, excluding pages "outside of the main tree", e.g. disambiguation pages.
     * Optionally, 'instance of' constraints can be given to limit the scope, e.g. instanceOf: [5] (Q5, human) will only search for humans
     */
    public static async searchAdvanced(text: string, options: WikidataAdvancedSearchoptions): Promise<{
        id: string,
        relevance?: number,
        label: string,
        description?: string
    }[]> {
        let instanceOf = ""
        if (options?.instanceOf !== undefined && options.instanceOf.length > 0) {
            const phrases = options.instanceOf.map(q => `{ ?item wdt:P31/wdt:P279* wd:Q${q}. }`)
            instanceOf = "{" + phrases.join(" UNION ") + "}"
        }
        const forbidden = (options?.notInstanceOf ?? [])
            .concat([17379835]) // blacklist 'wikimedia pages outside of the main knowledge tree', e.g. disambiguation pages
        const minusPhrases = forbidden.map(q => `MINUS {?item wdt:P31/wdt:P279* wd:Q${q} .}`)
        const sparql = `SELECT * WHERE {
            SERVICE wikibase:mwapi {
                bd:serviceParam wikibase:api "EntitySearch" .
                bd:serviceParam wikibase:endpoint "www.wikidata.org" .
                bd:serviceParam mwapi:search "${text}" .
                bd:serviceParam mwapi:language "${options.lang}" .
                ?item wikibase:apiOutputItem mwapi:item .
                ?num wikibase:apiOrdinal true .
                bd:serviceParam wikibase:limit ${Math.round((options.maxCount ?? 20) * 1.5) /*Some padding for disambiguation pages */} .
                ?label wikibase:apiOutput mwapi:label .
                ?description wikibase:apiOutput "@description" .
            }
            ${instanceOf}
            ${minusPhrases.join("\n    ")}
        } ORDER BY ASC(?num) LIMIT ${options.maxCount ?? 20}`
        const url = wds.sparqlQuery(sparql)

        const result = await Utils.downloadJson(url)

        return result.results.bindings.map(({item, label, description, num}) => ({
            relevance: num?.value,
            id: item?.value, // the full uri of the wikidata-item
            label: label?.value,
            description: description?.value
        }))
    }

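    /**
     * Searches wikidata via the 'wbsearchentities' API and keeps requesting further pages until 'maxCount' results are gathered or no more results are available
     */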
    public static async search(
        search: string,
        options?: WikidataSearchoptions,
        page = 1
    ): Promise<{
        id: string,
        label: string,
        description: string
    }[]> {
        const maxCount = options?.maxCount ?? 20
        let pageCount = Math.min(maxCount, 50)
        const start = page * pageCount - pageCount;
        const lang = (options?.lang ?? "en")
        const url =
            "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=" +
            search +
            "&language=" +
            lang +
            "&limit=" + pageCount + "&continue=" +
            start +
            "&format=json&uselang=" +
            lang +
            "&type=item&origin=*" +
            "&props="; // props= removes some unused values in the result
        const response = await Utils.downloadJsonCached(url, 10000)

        const result: any[] = response.search

        if (result.length < pageCount) {
            // No next page
            return result;
        }
        if (result.length < maxCount) {
            const newOptions = {...options}
            newOptions.maxCount = maxCount - result.length
            result.push(...await Wikidata.search(search,
                newOptions,
                page + 1
            ))
        }

        return result;
    }

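    /**
     * Searches via 'searchAdvanced' and loads the full WikidataResponse for every hit; entries that fail to load are dropped from the result
     */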
    public static async searchAndFetch(
        search: string,
        options?: WikidataAdvancedSearchoptions
    ): Promise<WikidataResponse[]> {
        // We provide some padding to filter away invalid values
        const searchResults = await Wikidata.searchAdvanced(search, options)
        const maybeResponses = await Promise.all(
            searchResults.map(async r => {
                try {
                    console.log("Loading ", r.id)
                    return await Wikidata.LoadWikidataEntry(r.id).AsPromise()
                } catch (e) {
                    console.error(e)
                    return undefined;
                }
            }))
        return Utils.NoNull(maybeResponses.map(r => <WikidataResponse>r["success"]))
    }

    /**
     * Gets the 'key' segment from a URL
     *
     * Wikidata.ExtractKey("https://www.wikidata.org/wiki/Lexeme:L614072") // => "L614072"
     * Wikidata.ExtractKey("http://www.wikidata.org/entity/Q55008046") // => "Q55008046"
     */
    public static ExtractKey(value: string | number): string {
        if (typeof value === "number") {
            return "Q" + value
        }
        if (value === undefined) {
            console.error("ExtractKey: value is undefined")
            return undefined;
        }
        value = value.trim().toLowerCase()

        for (const prefix of Wikidata._prefixesToRemove) {
            if (value.startsWith(prefix)) {
                value = value.substring(prefix.length)
            }
        }

        if (value.startsWith("http") || value === "") {
            // Probably some random link in the image field, or nothing left after stripping the prefixes - we skip it
            return undefined
        }

        for (const identifierPrefix of Wikidata._identifierPrefixes) {
            if (value.startsWith(identifierPrefix)) {
                const trimmed = value.substring(identifierPrefix.length);
                if (trimmed === "") {
                    return undefined
                }
                const n = Number(trimmed)
                if (isNaN(n)) {
                    return undefined
                }
                return value.toUpperCase();
            }
        }

        if (value !== "" && !isNaN(Number(value))) {
            return "Q" + value
        }

        return undefined;
    }

    /**
     * Converts 'Q123' into 123, returns undefined if invalid
     *
     * Wikidata.QIdToNumber("Q123") // => 123
     * Wikidata.QIdToNumber("  Q123  ") // => 123
     * Wikidata.QIdToNumber("  X123  ") // => undefined
     * Wikidata.QIdToNumber("  Q123X  ") // => undefined
     * Wikidata.QIdToNumber(undefined) // => undefined
     * Wikidata.QIdToNumber(123) // => 123
     */
    public static QIdToNumber(q: string | number): number | undefined {
        if (q === undefined || q === null) {
            return
        }
        if (typeof q === "number") {
            return q
        }
        q = q.trim()
        if (!q.startsWith("Q")) {
            return
        }
        q = q.substring(1)
        const n = Number(q)
        if (isNaN(n)) {
            return
        }
        return n
    }

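    /**
     * Constructs the URL of the wikidata page for the given entity or lexeme id
     *
     * Wikidata.IdToArticle("Q42") // => "https://wikidata.org/wiki/Q42"
     * Wikidata.IdToArticle("L614072") // => "https://wikidata.org/wiki/Lexeme:L614072"
     */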
    public static IdToArticle(id: string) {
        if (id.startsWith("Q")) {
            return "https://wikidata.org/wiki/" + id
        }
        if (id.startsWith("L")) {
            return "https://wikidata.org/wiki/Lexeme:" + id
        }
        throw "Unknown id type: " + id
    }

    /**
     * Loads a wikidata page
     * @returns the entity of the given value
     */
    public static async LoadWikidataEntryAsync(value: string | number): Promise<WikidataResponse> {
        const id = Wikidata.ExtractKey(value)
        if (id === undefined) {
            console.warn("Could not extract a wikidata entry from", value)
            return undefined
        }

        const url = "https://www.wikidata.org/wiki/Special:EntityData/" + id + ".json";
        const entities = (await Utils.downloadJsonCached(url, 10000)).entities
        const firstKey = <string>Array.from(Object.keys(entities))[0] // Roundabout way to fetch the entity; it might have been a redirect
        const response = entities[firstKey]

        if (id.startsWith("L")) {
            // This is a lexeme:
            return new WikidataLexeme(response).asWikidataResponse()
        }

        return WikidataResponse.fromJson(response)
    }

}