2021-10-02 22:31:16 +02:00
|
|
|
import {Utils} from "../../Utils";
|
2021-10-03 01:38:57 +02:00
|
|
|
import {UIEventSource} from "../UIEventSource";
|
2021-10-18 20:40:24 +02:00
|
|
|
import * as wds from "wikibase-sdk"
|
2021-10-02 22:31:16 +02:00
|
|
|
|
2021-10-09 22:40:52 +02:00
|
|
|
/**
 * A simplified, read-only view on a single wikidata entity.
 *
 * Instances are normally built with `WikidataResponse.fromJson` from the raw entity JSON.
 */
export class WikidataResponse {
    /** The entity id, copied verbatim from the source JSON */
    public readonly id: string
    /** Language code → label in that language */
    public readonly labels: Map<string, string>
    /** Language code → description in that language */
    public readonly descriptions: Map<string, string>
    /** Property id → set of simplified claim values for that property */
    public readonly claims: Map<string, Set<string>>
    /** Site prefix (the sitelink key without its trailing "wiki") → page title on that site */
    public readonly wikisites: Map<string, string>
    /** Page title of the "commonswiki" sitelink; undefined if the entity has none */
    public readonly commons: string

    constructor(
        id: string,
        labels: Map<string, string>,
        descriptions: Map<string, string>,
        claims: Map<string, Set<string>>,
        wikisites: Map<string, string>,
        commons: string
    ) {
        this.id = id
        this.labels = labels
        this.descriptions = descriptions
        this.claims = claims
        this.wikisites = wikisites
        this.commons = commons
    }

    /**
     * Converts the raw JSON of one wikidata entity into a WikidataResponse.
     *
     * @param entity the entity object; `id`, `labels`, `descriptions`, `sitelinks` and `claims` are read
     * @returns the simplified entity
     */
    public static fromJson(entity: any): WikidataResponse {
        const labels = new Map<string, string>()
        for (const language in entity.labels) {
            // The key is the language code
            labels.set(language, entity.labels[language].value)
        }

        const descriptions = new Map<string, string>()
        for (const language in entity.descriptions) {
            // The key is the language code
            descriptions.set(language, entity.descriptions[language].value)
        }

        const wikiSuffix = "wiki"
        const sitelinks = new Map<string, string>()
        for (const siteName in entity.sitelinks) {
            if (!siteName.endsWith(wikiSuffix)) {
                // Keys such as "enwikiquote" or "enwikivoyage" are not of the form `${language}wiki`.
                // Blindly cutting off four characters would register a bogus language (e.g. "enwikiq"), so these are skipped
                continue
            }
            const language = siteName.substring(0, siteName.length - wikiSuffix.length)
            sitelinks.set(language, entity.sitelinks[siteName].title)
        }

        // "commonswiki" ends up under the key "commons"; it is exposed separately and is not a language
        const commons = sitelinks.get("commons")
        sitelinks.delete("commons")

        const claims = WikidataResponse.extractClaims(entity.claims)
        return new WikidataResponse(
            entity.id,
            labels,
            descriptions,
            claims,
            sitelinks,
            commons
        )
    }

    /**
     * Simplifies the raw `claims` object of an entity with wikibase-sdk (time values use the 'simple-day' converter)
     * and groups the resulting values per property id.
     *
     * @param claimsJson the raw `claims` object of an entity; a missing object yields an empty map
     * @returns property id → set of simplified values
     */
    static extractClaims(claimsJson: any): Map<string, Set<string>> {
        const claims = new Map<string, Set<string>>()
        if (claimsJson === undefined || claimsJson === null) {
            // Nothing to simplify
            return claims
        }

        const simplified = wds.simplify.claims(claimsJson, {
            timeConverter: 'simple-day'
        })
        for (const claimId in simplified) {
            const claimsList: any[] = simplified[claimId]
            claims.set(claimId, new Set(claimsList))
        }
        return claims
    }
}
|
|
|
|
|
|
|
|
/**
 * A simplified view on a wikidata lexeme, built from the raw entity JSON.
 */
export class WikidataLexeme {
    /** The lexeme id, copied verbatim from the source JSON */
    id: string
    /** Language code → lemma in that language */
    lemma: Map<string, string>
    /** Language code → the glosses of all senses in that language, joined with "; " */
    senses: Map<string, string>
    /** Property id → set of simplified claim values */
    claims: Map<string, Set<string>>

    /**
     * @param json the raw lexeme object; `id`, `claims`, `lemmas` and `senses` are read
     */
    constructor(json: any) {
        this.id = json.id
        this.claims = WikidataResponse.extractClaims(json.claims)

        this.lemma = new Map<string, string>()
        for (const language in json.lemmas) {
            this.lemma.set(language, json.lemmas[language].value)
        }

        this.senses = new Map<string, string>()
        // A lexeme without a `senses` list is treated as having no senses
        for (const sense of json.senses ?? []) {
            const glosses = sense.glosses
            for (const language in glosses) {
                // The fallback must be applied to the gloss itself: `a + b ?? ""` parses as `(a + b) ?? ""`,
                // which would store the text "undefined" for a gloss without a value
                const gloss: string = glosses[language].value ?? ""
                const previousSenses = this.senses.get(language)
                if (previousSenses === undefined) {
                    this.senses.set(language, gloss)
                } else {
                    this.senses.set(language, previousSenses + "; " + gloss)
                }
            }
        }
    }

    /**
     * Repackages this lexeme as a WikidataResponse: the lemmas are passed as labels and the senses as descriptions.
     * The sitelinks are empty and there is no commons entry.
     */
    asWikidataResponse() {
        return new WikidataResponse(
            this.id,
            this.lemma,
            this.senses,
            this.claims,
            new Map(),
            undefined
        )
    }
}
|
|
|
|
|
|
|
|
/**
 * Options accepted by the text searches on wikidata.
 */
export interface WikidataSearchoptions {
    /**
     * Language code used for searching and for the returned labels.
     * `Wikidata.search` falls back to "en" when omitted.
     */
    lang?: string,
    /**
     * Maximum number of results.
     * `Wikidata.search` falls back to 20 when omitted.
     */
    maxCount?: number
}
|
|
|
|
|
|
|
|
/**
 * Utility functions around wikidata
 */
export default class Wikidata {

    private static readonly _identifierPrefixes = ["Q", "L"].map(str => str.toLowerCase())
    private static readonly _prefixesToRemove = ["https://www.wikidata.org/wiki/Lexeme:", "https://www.wikidata.org/wiki/", "Lexeme:"].map(str => str.toLowerCase())

    /** Entity key → event source of the (pending) download, so that every entity is requested only once */
    private static readonly _cache = new Map<string, UIEventSource<{ success: WikidataResponse } | { error: any }>>()

    /** The highest `limit` that `search` asks for in a single request */
    private static readonly _maxSearchPageSize = 50

    /**
     * Loads the given entity and wraps the outcome in a UIEventSource.
     * A second call for the same entity returns the very same event source.
     *
     * @param value anything `ExtractKey` understands: an id, a bare number or a wikidata URL
     */
    public static LoadWikidataEntry(value: string | number): UIEventSource<{ success: WikidataResponse } | { error: any }> {
        // Refer to the class by name: `this` is lost when this static method is passed around as a callback
        const key = Wikidata.ExtractKey(value)
        const cached = Wikidata._cache.get(key)
        if (cached !== undefined) {
            return cached
        }
        const src = UIEventSource.FromPromiseWithErr(Wikidata.LoadWikidataEntryAsync(key))
        Wikidata._cache.set(key, src)
        return src
    }

    /**
     * Searches entities matching `text` which are an instance of (P31) or a subclass of (P279) the given item,
     * by means of a SPARQL query.
     *
     * @param text the text to search for
     * @param options `instanceOf` is the numeric part of the Q-id of the class; `lang` defaults to "en" and `maxCount` to 20
     * @returns the `results.bindings` part of the downloaded response
     */
    public static async searchAdvanced(text: string, options: WikidataSearchoptions & {
        instanceOf: number
    }) {
        // Without these defaults, the literal text "undefined" would end up in the query
        const lang = options.lang ?? "en"
        const maxCount = options.maxCount ?? 20
        const sparql = `SELECT * WHERE {
            SERVICE wikibase:mwapi {
                bd:serviceParam wikibase:api "EntitySearch" .
                bd:serviceParam wikibase:endpoint "www.wikidata.org" .
                bd:serviceParam mwapi:search "${Wikidata.escapeSparqlString(text)}" .
                bd:serviceParam mwapi:language "${Wikidata.escapeSparqlString(lang)}" .
                ?item wikibase:apiOutputItem mwapi:item .
                ?num wikibase:apiOrdinal true .
            }
            ?item (wdt:P279|wdt:P31) wd:Q${options.instanceOf}
        } ORDER BY ASC(?num) LIMIT ${maxCount}`
        const url = wds.sparqlQuery(sparql)

        const result = await Utils.downloadJson(url, {"User-Agent": "MapComplete script"})
        return result.results.bindings
    }

    /**
     * Searches wikidata items by text with the `wbsearchentities` API.
     * At most 50 results are requested at once; more requests follow until `maxCount` results
     * are collected or the API runs out of results.
     *
     * @param search the text to search for
     * @param options `lang` defaults to "en", `maxCount` to 20
     * @param page 1-based number of the page to start from, where one page holds `min(maxCount, 50)` results
     * @returns at most `maxCount` search hits
     */
    public static async search(
        search: string,
        options?: WikidataSearchoptions,
        page = 1
    ): Promise<{
        id: string,
        label: string,
        description: string
    }[]> {
        const maxCount = options?.maxCount ?? 20
        const lang = encodeURIComponent(options?.lang ?? "en")
        const firstPageSize = Math.min(maxCount, Wikidata._maxSearchPageSize)
        // The offset is tracked explicitly. Deriving it from a page number breaks down as soon as
        // the page size changes between two requests, which then yields duplicated results
        let offset = (page - 1) * firstPageSize

        const results: { id: string, label: string, description: string }[] = []
        while (results.length < maxCount) {
            const limit = Math.min(maxCount - results.length, Wikidata._maxSearchPageSize)
            const url =
                "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=" +
                encodeURIComponent(search) + // characters such as '&' or '#' would otherwise cut the query short
                "&language=" +
                lang +
                "&limit=" + limit + "&continue=" +
                offset +
                "&format=json&uselang=" +
                lang +
                "&type=item&origin=*" +
                "&props=" // props= removes some unused values in the result
            const response = await Utils.downloadJsonCached(url, 10000)
            const batch: any[] = response.search
            results.push(...batch)
            if (batch.length < limit) {
                // No next page
                break
            }
            offset += batch.length
        }
        return results
    }

    /**
     * Searches wikidata items by text and loads the full entity of every hit.
     * Hits which cannot be loaded and disambiguation pages are dropped.
     *
     * @param search the text to search for
     * @param options see `search`; this object is not modified
     * @returns at most `maxCount` (default: 20) entities
     */
    public static async searchAndFetch(
        search: string,
        options?: WikidataSearchoptions
    ): Promise<WikidataResponse[]> {
        const maxCount = options?.maxCount ?? 20
        // We provide some padding to filter away invalid values.
        // This is done on a copy: writing the padded count into the callers' object would inflate it on every reuse
        const paddedOptions: WikidataSearchoptions = {...options, maxCount: Math.ceil(maxCount * 1.5)}
        const searchResults = await Wikidata.search(search, paddedOptions)
        const maybeResponses = await Promise.all(searchResults.map(async r => {
            try {
                return await Wikidata.LoadWikidataEntry(r.id).AsPromise()
            } catch (e) {
                console.error(e)
                return undefined
            }
        }))
        const responses = maybeResponses
            // A failed download is `undefined` here, hence the optional access
            .map(r => <WikidataResponse>r?.["success"])
            .filter(wd => {
                if (wd === undefined) {
                    return false
                }
                if (wd.claims.get("P31" /*Instance of*/)?.has("Q4167410"/* Wikimedia Disambiguation page*/)) {
                    return false
                }
                return true
            })
        return responses.slice(0, maxCount)
    }

    /**
     * Gets the 'key' segment from a URL
     *
     * Wikidata.ExtractKey("https://www.wikidata.org/wiki/Lexeme:L614072") // => "L614072"
     * Wikidata.ExtractKey(" q42 ") // => "Q42"
     * Wikidata.ExtractKey("42") // => "Q42"
     * Wikidata.ExtractKey(42) // => "Q42"
     * Wikidata.ExtractKey("Q1.5") // => undefined
     * Wikidata.ExtractKey("https://example.org/Q42") // => undefined
     *
     * @param value an id, a bare number or a wikidata URL; case and surrounding whitespace are ignored
     * @returns the uppercased id, or undefined if no id could be recognized
     */
    public static ExtractKey(value: string | number): string {
        if (typeof value === "number") {
            return "Q" + value
        }
        if (value === undefined || value === null) {
            console.error("ExtractKey: value is undefined")
            return undefined
        }
        value = value.trim().toLowerCase()

        for (const prefix of Wikidata._prefixesToRemove) {
            if (value.startsWith(prefix)) {
                value = value.substring(prefix.length)
            }
        }

        if (value === "" || value.startsWith("http")) {
            // Nothing left, or probably some random link in the image field - we skip it
            return undefined
        }

        // `Number(...)` is too lenient to validate the numeric part: it accepts "1.5", "1e3", "0x1f" or "-3"
        const onlyDigits = /^[0-9]+$/
        for (const identifierPrefix of Wikidata._identifierPrefixes) {
            if (value.startsWith(identifierPrefix)) {
                const trimmed = value.substring(identifierPrefix.length)
                if (!onlyDigits.test(trimmed)) {
                    return undefined
                }
                return value.toUpperCase()
            }
        }

        if (onlyDigits.test(value)) {
            // A bare number is read as an item id
            return "Q" + value
        }

        return undefined
    }

    /**
     * Gives the URL of the wikidata page of an item (Q...) or a lexeme (L...).
     *
     * @param id the uppercase id, e.g. as returned by `ExtractKey`
     * @throws a string if the id starts with neither "Q" nor "L"
     */
    public static IdToArticle(id: string) {
        if (id.startsWith("Q")) {
            return "https://wikidata.org/wiki/" + id
        }
        if (id.startsWith("L")) {
            return "https://wikidata.org/wiki/Lexeme:" + id
        }
        throw "Unknown id type: " + id
    }

    /**
     * Loads a wikidata page
     *
     * @param value anything `ExtractKey` understands
     * @returns the entity of the given value
     * @throws a string if no id can be extracted from `value` or if the download contains no entity
     */
    public static async LoadWikidataEntryAsync(value: string | number): Promise<WikidataResponse> {
        const id = Wikidata.ExtractKey(value)
        if (id === undefined) {
            console.warn("Could not extract a wikidata entry from", value)
            throw "Could not extract a wikidata entry from " + value
        }

        const url = "https://www.wikidata.org/wiki/Special:EntityData/" + id + ".json"
        const entities = (await Utils.downloadJsonCached(url, 10000)).entities
        // Roundabout way to fetch the entity; it might have been a redirect
        const firstKey = Object.keys(entities ?? {})[0]
        const response = entities?.[firstKey]
        if (response === undefined) {
            throw "No wikidata entity found for " + id
        }

        if (id.startsWith("L")) {
            // This is a lexeme:
            return new WikidataLexeme(response).asWikidataResponse()
        }

        return WikidataResponse.fromJson(response)
    }

    /**
     * Escapes a text so that it can be placed between double quotes in a SPARQL query.
     */
    private static escapeSparqlString(raw: string): string {
        return raw
            .replace(/\\/g, "\\\\")
            .replace(/"/g, "\\\"")
            .replace(/\n/g, "\\n")
            .replace(/\r/g, "\\r")
    }
}
|