MapComplete/Logic/Web/Wikidata.ts

328 lines
11 KiB
TypeScript
Raw Normal View History

2021-10-02 22:31:16 +02:00
import {Utils} from "../../Utils";
2021-10-03 01:38:57 +02:00
import {UIEventSource} from "../UIEventSource";
import * as wds from "wikibase-sdk"
2021-10-02 22:31:16 +02:00
/**
 * Simplified, read-only view on a single wikidata entity.
 *
 * Instances are normally built with `WikidataResponse.fromJson`, which flattens the
 * raw entity JSON into plain maps.
 */
export class WikidataResponse {
    /** The id of the entity, copied from `entity.id` */
    public readonly id: string
    /** Language code --> label in that language */
    public readonly labels: Map<string, string>
    /** Language code --> description in that language */
    public readonly descriptions: Map<string, string>
    /** Property id (e.g. `P31`) --> the set of simplified values of that property */
    public readonly claims: Map<string, Set<string>>
    /** Site name without the `wiki` suffix (e.g. `en` for `enwiki`) --> page title on that site */
    public readonly wikisites: Map<string, string>
    /** Title of the `commonswiki` sitelink; `undefined` if the entity has none */
    public readonly commons: string

    constructor(
        id: string,
        labels: Map<string, string>,
        descriptions: Map<string, string>,
        claims: Map<string, Set<string>>,
        wikisites: Map<string, string>,
        commons: string
    ) {
        this.id = id
        this.labels = labels
        this.descriptions = descriptions
        this.claims = claims
        this.wikisites = wikisites
        this.commons = commons
    }

    /**
     * Converts the raw JSON of one entity into a `WikidataResponse`.
     *
     * @param entity the entity object; `id`, `labels`, `descriptions`, `sitelinks` and `claims` are read
     * @returns the flattened entity
     */
    public static fromJson(entity: any): WikidataResponse {
        const labels = new Map<string, string>()
        for (const language in entity.labels) {
            // The key is the language code
            labels.set(language, entity.labels[language].value)
        }

        const descriptions = new Map<string, string>()
        for (const language in entity.descriptions) {
            // The key is the language code
            descriptions.set(language, entity.descriptions[language].value)
        }

        const wikiSuffix = "wiki"
        const sitelinks = new Map<string, string>()
        for (const siteName in entity.sitelinks) {
            // Only keys of the form `${language}wiki` are wanted here.
            // Other projects (e.g. `enwikiquote`, `enwikivoyage`) do not follow that pattern;
            // blindly chopping off four characters would register them under a nonsense language code
            if (siteName.length <= wikiSuffix.length || !siteName.endsWith(wikiSuffix)) {
                continue
            }
            const language = siteName.substring(0, siteName.length - wikiSuffix.length)
            sitelinks.set(language, entity.sitelinks[siteName].title)
        }

        // 'commonswiki' is not a language: it is exposed separately
        const commons = sitelinks.get("commons")
        sitelinks.delete("commons")

        const claims = WikidataResponse.extractClaims(entity.claims)
        return new WikidataResponse(
            entity.id,
            labels,
            descriptions,
            claims,
            sitelinks,
            commons
        )
    }

    /**
     * Simplifies the claims with `wds.simplify.claims` (using the 'simple-day' time converter)
     * and collects the values per property.
     *
     * @param claimsJson the `claims` object of an entity
     * @returns property id --> set of simplified values
     */
    static extractClaims(claimsJson: any): Map<string, Set<string>> {
        const simplified = wds.simplify.claims(claimsJson, {
            timeConverter: 'simple-day'
        })

        const claims = new Map<string, Set<string>>()
        for (const claimId in simplified) {
            const claimsList: any[] = simplified[claimId]
            claims.set(claimId, new Set(claimsList))
        }
        return claims
    }
}
/**
 * A wikidata lexeme (an `L`-entity), built from the raw entity JSON.
 */
export class WikidataLexeme {
    /** The id of the lexeme, copied from `json.id` */
    id: string
    /** Language code --> lemma in that language */
    lemma: Map<string, string>
    /** Language code --> all glosses in that language, joined with "; " */
    senses: Map<string, string>
    /** Property id --> set of simplified values, as built by `WikidataResponse.extractClaims` */
    claims: Map<string, Set<string>>

    /**
     * @param json the raw lexeme entity; `id`, `claims`, `lemmas` and `senses` are read
     */
    constructor(json: any) {
        this.id = json.id
        this.claims = WikidataResponse.extractClaims(json.claims)

        this.lemma = new Map<string, string>()
        for (const language in json.lemmas) {
            this.lemma.set(language, json.lemmas[language].value)
        }

        this.senses = new Map<string, string>()
        // A lexeme without a 'senses'-array is treated as having no senses
        for (const sense of json.senses ?? []) {
            const glosses = sense.glosses
            for (const language in glosses) {
                const gloss = glosses[language]?.value
                if (gloss === undefined || gloss === null) {
                    // Nothing to add. Note that `a + b ?? ""` parses as `(a + b) ?? ""`,
                    // which would append the literal text "undefined" instead of falling back
                    continue
                }
                const previousSenses = this.senses.get(language)
                if (previousSenses === undefined) {
                    this.senses.set(language, gloss)
                } else {
                    this.senses.set(language, previousSenses + "; " + gloss)
                }
            }
        }
    }

    /**
     * Repackages this lexeme as a `WikidataResponse`: the lemmas are used as labels and the
     * joined senses as descriptions. There are no wikisites and no commons entry.
     */
    asWikidataResponse(): WikidataResponse {
        return new WikidataResponse(
            this.id,
            this.lemma,
            this.senses,
            this.claims,
            new Map(),
            undefined
        );
    }
}
/**
 * Optional settings for the search functions of `Wikidata`.
 */
export interface WikidataSearchoptions {
    /**
     * Language code to search in.
     * `Wikidata.search` falls back to "en" if omitted.
     */
    lang?: string;
    /**
     * Upper bound on the number of results.
     * `Wikidata.search` falls back to 20 if omitted.
     */
    maxCount?: number;
}
/**
 * Utility functions around wikidata
 */
export default class Wikidata {
    private static readonly _identifierPrefixes = ["Q", "L"].map(str => str.toLowerCase())
    private static readonly _prefixesToRemove = ["https://www.wikidata.org/wiki/Lexeme:", "https://www.wikidata.org/wiki/", "Lexeme:"].map(str => str.toLowerCase())
    private static readonly _cache = new Map<string, UIEventSource<{ success: WikidataResponse } | { error: any }>>()
    /** The biggest page size that `search` requests in a single call */
    private static readonly _maxPageSize = 50
    /** Used when `WikidataSearchoptions.maxCount` is not given */
    private static readonly _defaultMaxCount = 20
    /** Used when `WikidataSearchoptions.lang` is not given */
    private static readonly _defaultLanguage = "en"

    /**
     * Loads the given entity, wrapped in an event source which will contain either
     * `{success: ...}` or `{error: ...}`.
     * The event source is cached per extracted key, so the same entity is only requested once.
     *
     * @param value anything `ExtractKey` understands
     */
    public static LoadWikidataEntry(value: string | number): UIEventSource<{ success: WikidataResponse } | { error: any }> {
        const key = Wikidata.ExtractKey(value)
        const cached = Wikidata._cache.get(key)
        if (cached !== undefined) {
            return cached
        }
        const src = UIEventSource.FromPromiseWithErr(Wikidata.LoadWikidataEntryAsync(key))
        Wikidata._cache.set(key, src)
        return src;
    }

    /**
     * Searches entities by text via a SPARQL query, keeping only the items which are
     * an instance of (P31) or a subclass of (P279) `Q${options.instanceOf}`.
     *
     * @param text the text to search for
     * @param options `lang` defaults to "en" and `maxCount` to 20
     * @returns the `results.bindings` of the SPARQL response
     */
    public static async searchAdvanced(text: string, options: WikidataSearchoptions & {
        instanceOf: number
    }) {
        // Without these defaults, the literal text "undefined" would end up in the query
        const lang = options.lang ?? Wikidata._defaultLanguage
        const maxCount = options.maxCount ?? Wikidata._defaultMaxCount
        const sparql = `SELECT * WHERE {
            SERVICE wikibase:mwapi {
                bd:serviceParam wikibase:api "EntitySearch" .
                bd:serviceParam wikibase:endpoint "www.wikidata.org" .
                bd:serviceParam mwapi:search "${Wikidata.escapeSparqlString(text)}" .
                bd:serviceParam mwapi:language "${Wikidata.escapeSparqlString(lang)}" .
                ?item wikibase:apiOutputItem mwapi:item .
                ?num wikibase:apiOrdinal true .
            }
            ?item (wdt:P279|wdt:P31) wd:Q${options.instanceOf}
        } ORDER BY ASC(?num) LIMIT ${maxCount}`
        const url = wds.sparqlQuery(sparql)
        const result = await Utils.downloadJson(url, {"User-Agent": "MapComplete script"})
        return result.results.bindings
    }

    /**
     * Searches wikidata items with the `wbsearchentities` API.
     *
     * At most 50 results are requested per call; further calls are made until `maxCount`
     * results are collected or the API returns fewer results than were asked for.
     *
     * @param search the text to search for
     * @param options `lang` defaults to "en" and `maxCount` to 20
     * @param page 1-based index of the first page to fetch, where one page holds `min(maxCount, 50)` results
     * @returns the entries of the `search` array of the API responses, in order
     */
    public static async search(
        search: string,
        options?: WikidataSearchoptions,
        page = 1
    ): Promise<{
        id: string,
        label: string,
        description: string
    }[]> {
        const maxCount = options?.maxCount ?? Wikidata._defaultMaxCount
        const lang = options?.lang ?? Wikidata._defaultLanguage
        const firstPageSize = Math.min(maxCount, Wikidata._maxPageSize)
        // The offset is tracked explicitly. Deriving it from a page number goes wrong as soon as
        // the last page is smaller than the previous ones: already-seen results are fetched again
        let offset = (page - 1) * firstPageSize
        const results: { id: string, label: string, description: string }[] = []

        while (results.length < maxCount) {
            const limit = Math.min(maxCount - results.length, Wikidata._maxPageSize)
            const url =
                "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=" +
                encodeURIComponent(search) + // characters such as '&' or '#' would otherwise corrupt the query string
                "&language=" +
                lang +
                "&limit=" + limit + "&continue=" +
                offset +
                "&format=json&uselang=" +
                lang +
                "&type=item&origin=*" +
                "&props=";// props= removes some unused values in the result
            const response = await Utils.downloadJsonCached(url, 10000)
            const batch: any[] = response?.search ?? []
            results.push(...batch)
            if (batch.length < limit) {
                // No next page
                break
            }
            offset += batch.length
        }
        return results;
    }

    /**
     * Searches for the given text and loads the full entry of every hit.
     * Entries which failed to load and wikimedia disambiguation pages are dropped.
     *
     * @param search the text to search for
     * @param options `maxCount` (default 20) bounds the number of returned entries; the object is not modified
     */
    public static async searchAndFetch(
        search: string,
        options?: WikidataSearchoptions
    ): Promise<WikidataResponse[]> {
        const maxCount = options?.maxCount ?? Wikidata._defaultMaxCount
        // We provide some padding to filter away invalid values.
        // This goes into a copy: writing it into the options of the caller would inflate the count on every call
        const paddedOptions: WikidataSearchoptions = {...options, maxCount: Math.ceil(maxCount * 1.5)}
        const searchResults = await Wikidata.search(search, paddedOptions)
        const maybeResponses = await Promise.all(searchResults.map(async r => {
            try {
                return await Wikidata.LoadWikidataEntry(r.id).AsPromise()
            } catch (e) {
                console.error(e)
                return undefined;
            }
        }))
        const responses = maybeResponses
            // 'r' is undefined if loading threw; an {error: ...}-result has no 'success' either
            .map(r => <WikidataResponse>r?.["success"])
            .filter(wd => {
                if (wd === undefined) {
                    return false;
                }
                if (wd.claims.get("P31" /*Instance of*/)?.has("Q4167410"/* Wikimedia Disambiguation page*/)) {
                    return false;
                }
                return true;
            })
        return responses.slice(0, maxCount)
    }

    /**
     * Gets the 'key' segment from a URL
     *
     * Wikidata.ExtractKey("https://www.wikidata.org/wiki/Lexeme:L614072") // => "L614072"
     * Wikidata.ExtractKey("q42") // => "Q42"
     * Wikidata.ExtractKey("42") // => "Q42"
     * Wikidata.ExtractKey(42) // => "Q42"
     * Wikidata.ExtractKey("https://example.org/some-image.jpg") // => undefined
     * Wikidata.ExtractKey("Q1e5") // => undefined
     *
     * @param value a number, an id, a bare number in a string or one of the known URL forms
     * @returns the uppercased id, or `undefined` if no valid id could be extracted
     */
    public static ExtractKey(value: string | number): string {
        if (typeof value === "number") {
            return "Q" + value
        }
        if (value === undefined || value === null) {
            console.error("ExtractKey: value is undefined")
            return undefined;
        }
        let candidate = value.trim().toLowerCase()
        for (const prefix of Wikidata._prefixesToRemove) {
            if (candidate.startsWith(prefix)) {
                candidate = candidate.substring(prefix.length)
            }
        }

        // Only plain decimal digits are accepted. `Number(...)` also accepts input such as
        // "1e5", "0x10", "-5" or "4.2", none of which is a valid id.
        // Random links (e.g. from an image field) fail this check as well and yield undefined
        const onlyDigits = /^\d+$/
        for (const identifierPrefix of Wikidata._identifierPrefixes) {
            if (candidate.startsWith(identifierPrefix)) {
                const digits = candidate.substring(identifierPrefix.length)
                return onlyDigits.test(digits) ? candidate.toUpperCase() : undefined
            }
        }
        if (onlyDigits.test(candidate)) {
            // A bare number is interpreted as an item
            return "Q" + candidate
        }
        return undefined;
    }

    /**
     * Builds the URL of the wikidata page of the given id.
     *
     * @param id an id starting with an uppercase `Q` or `L`
     * @throws a string if the id starts with anything else
     */
    public static IdToArticle(id: string) {
        if (id.startsWith("Q")) {
            return "https://wikidata.org/wiki/" + id
        }
        if (id.startsWith("L")) {
            return "https://wikidata.org/wiki/Lexeme:" + id
        }
        throw "Unknown id type: " + id
    }

    /**
     * Loads a wikidata page
     * @returns the entity of the given value
     * @throws a string if no id can be extracted from `value` or if the response contains no entity
     */
    public static async LoadWikidataEntryAsync(value: string | number): Promise<WikidataResponse> {
        const id = Wikidata.ExtractKey(value)
        if (id === undefined) {
            console.warn("Could not extract a wikidata entry from", value)
            throw "Could not extract a wikidata entry from " + value
        }

        const url = "https://www.wikidata.org/wiki/Special:EntityData/" + id + ".json";
        const entities = (await Utils.downloadJsonCached(url, 10000))?.entities ?? {}
        const firstKey = <string>Object.keys(entities)[0] // Roundabout way to fetch the entity; it might have been a redirect
        const response = entities[firstKey]
        if (response === undefined || response === null) {
            throw "No wikidata entity found for " + id
        }

        if (id.startsWith("L")) {
            // This is a lexeme:
            return new WikidataLexeme(response).asWikidataResponse()
        }

        return WikidataResponse.fromJson(response)
    }

    /**
     * Escapes backslashes, double quotes and line breaks, so that the text can be placed
     * between double quotes in a SPARQL query without ending the string literal early.
     */
    private static escapeSparqlString(text: string): string {
        return text
            .replace(/\\/g, "\\\\")
            .replace(/"/g, "\\\"")
            .replace(/\n/g, "\\n")
            .replace(/\r/g, "\\r")
    }
}