MapComplete/Logic/Web/Wikipedia.ts

220 lines
7.8 KiB
TypeScript
Raw Normal View History

2021-10-02 17:57:54 +02:00
/**
 * Some useful utility functions around the Wikipedia API
*/
2022-09-08 21:40:48 +02:00
import { Utils } from "../../Utils"
import { UIEventSource } from "../UIEventSource"
import { WikipediaBoxOptions } from "../../UI/Wikipedia/WikipediaBox"
2021-10-02 17:57:54 +02:00
/**
 * Small client for a single MediaWiki instance (Wikipedia by default).
 *
 * An instance is bound to one backend (e.g. `https://nl.wikipedia.org`) and offers helpers to
 * build URLs, to search that wiki, and to download an article as cleaned-up HTML.
 * Downloaded articles are cached in a static map which is shared by all instances.
 */
export default class Wikipedia {
    /**
     * When getting a wikipedia page data result, some elements (e.g. navigation, infoboxes, ...) are removed.
     * We do this based on the classes. This list contains a blacklist of the classes to remove
     * @private
     */
    private static readonly classesToRemove = [
        "shortdescription",
        "sidebar",
        "infobox",
        "infobox_v2",
        "noprint",
        "ambox",
        "mw-editsection",
        "mw-selflink",
        "mw-empty-elt",
        "hatnote", // Often redirects
    ]

    /**
     * Ids of elements which are removed from a downloaded article
     * @private
     */
    private static readonly idsToRemove = ["sjabloon_zie"]

    /**
     * Cache of requested articles; the key is built from the backend, the page name and the 'firstParagraphOnly'-flag
     * @private
     */
    private static readonly _cache = new Map<
        string,
        UIEventSource<{ success: string } | { error: any }>
    >()

    /**
     * Base URL of the wiki, always including the protocol, e.g. "https://en.wikipedia.org"
     */
    public readonly backend: string

    /**
     * @param options either the language of the Wikipedia to use, or the backend of another MediaWiki instance.
     *                If both are given, 'backend' wins. Defaults to the English Wikipedia.
     *
     * new Wikipedia().backend // => "https://en.wikipedia.org"
     * new Wikipedia({language: "nl"}).backend // => "https://nl.wikipedia.org"
     */
    constructor(options?: { language?: "en" | string } | { backend?: string }) {
        this.backend = Wikipedia.getBackendUrl(options ?? {})
    }

    /**
     * Tries to extract the language and article name from the given string.
     * The string is split on the first colon; returns undefined if there is nothing to split.
     *
     * Wikipedia.extractLanguageAndName("qsdf") // => undefined
     * Wikipedia.extractLanguageAndName("nl:Warandeputten") // => {language: "nl", pageName: "Warandeputten"}
     */
    public static extractLanguageAndName(
        input: string
    ): { language: string; pageName: string } | undefined {
        const matched = input?.match(/([^:]+):(.*)/)
        if (matched === null || matched === undefined) {
            return undefined
        }
        const [, language, pageName] = matched
        return { language, pageName }
    }

    /**
     * Extracts the actual pagename; returns undefined if this came from a different wikimedia entry
     *
     * new Wikipedia({backend: "https://wiki.openstreetmap.org"}).extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => "NL:Speelbos"
     * new Wikipedia().extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => undefined
     * new Wikipedia().extractPageName("https://en.wikipedia.org.evil.example/wiki/X") // => undefined
     */
    public extractPageName(input: string): string | undefined {
        if (!input?.startsWith(this.backend)) {
            return undefined
        }
        const path = input.substring(this.backend.length)
        // Anchored on purpose: 'wiki/' must directly follow the backend, otherwise a different host
        // which merely starts with the same characters would be accepted as well
        const matched = path.match(/^\/?wiki\/(.+)/)
        return matched?.[1]
    }

    /**
     * Determines the base URL to use: an explicit backend wins over a language, the fallback is the English Wikipedia.
     * "https://" is prepended if the result does not start with "http"
     */
    private static getBackendUrl(
        options: { language?: "en" | string } | { backend?: "en.wikipedia.org" | string }
    ): string {
        let backend = "en.wikipedia.org"
        if ("backend" in options && options.backend) {
            backend = options.backend
        } else if ("language" in options && options.language) {
            backend = `${options.language}.wikipedia.org`
        }
        if (!backend.startsWith("http")) {
            backend = "https://" + backend
        }
        return backend
    }

    /**
     * Makes a page name safe to be used as value of a query parameter.
     * The name is decoded first, so that a name which is already percent-encoded is not encoded twice.
     */
    private static encodePageName(pageName: string): string {
        let decoded = pageName
        try {
            decoded = decodeURIComponent(pageName)
        } catch {
            // Malformed escape sequence, e.g. a literal '%' as in "100%": the name was not encoded, use it as is
        }
        return encodeURIComponent(decoded)
    }

    /**
     * Gets the (cleaned) article as an event source; the event source is created once per
     * backend, page name and 'firstParagraphOnly'-flag and is reused on subsequent calls
     */
    public GetArticle(
        pageName: string,
        options: WikipediaBoxOptions
    ): UIEventSource<{ success: string } | { error: any }> {
        const key = this.backend + ":" + pageName + ":" + (options?.firstParagraphOnly ?? false)
        const cached = Wikipedia._cache.get(key)
        if (cached !== undefined) {
            return cached
        }
        const v = UIEventSource.FromPromiseWithErr(this.GetArticleAsync(pageName, options))
        Wikipedia._cache.set(key, v)
        return v
    }

    /**
     * The URL of the 'parse'-API-call which yields the rendered HTML of the given page.
     * The page name is escaped, so that names containing e.g. '&' or '+' are requested correctly.
     *
     * new Wikipedia().getDataUrl("AT&T") // => "https://en.wikipedia.org/w/api.php?action=parse&format=json&origin=*&prop=text&page=AT%26T"
     */
    public getDataUrl(pageName: string): string {
        return (
            `${this.backend}/w/api.php?action=parse&format=json&origin=*&prop=text&page=` +
            Wikipedia.encodePageName(pageName)
        )
    }

    /**
     * The URL of the human-readable page
     */
    public getPageUrl(pageName: string): string {
        return `${this.backend}/wiki/${pageName}`
    }

    /**
     * Textual search of the specified wiki-instance. If searching Wikipedia, we recommend using wikidata.search instead
     * @param searchTerm
     */
    public async search(searchTerm: string): Promise<{ title: string; snippet: string }[]> {
        const url =
            this.backend +
            "/w/api.php?action=query&format=json&list=search&srsearch=" +
            encodeURIComponent(searchTerm)
        return (await Utils.downloadJson(url))["query"]["search"]
    }

    /**
     * Searches via 'index.php' and scrapes the result.
     * This gives better results than via the API
     * @param searchTerm
     */
    public async searchViaIndex(
        searchTerm: string
    ): Promise<{ title: string; snippet: string; url: string }[]> {
        const url = `${this.backend}/w/index.php?search=${encodeURIComponent(searchTerm)}&ns0=1`
        const response = await Utils.downloadAdvanced(url)
        const targetUrl: string | undefined = response["redirect"]
        if (targetUrl) {
            // This is an exact match
            return [
                {
                    // If the page name cannot be derived from the target, the search term is the best label we have
                    title: this.extractPageName(targetUrl)?.trim() ?? searchTerm,
                    url: targetUrl,
                    snippet: "",
                },
            ]
        }

        const el = document.createElement("html")
        // Make site-relative links absolute. Protocol-relative links ('//host/...') are left untouched
        el.innerHTML = response["content"].replace(
            /href="\/(?!\/)/g,
            'href="' + this.backend + "/"
        )
        const searchResults = el.getElementsByClassName("mw-search-results")
        const individualResults = Array.from(
            searchResults[0]?.getElementsByClassName("mw-search-result") ?? []
        )
        return individualResults.map((entry) => {
            // Alternative titles would pollute the text of the heading
            for (const altTitle of Array.from(entry.getElementsByClassName("searchalttitle"))) {
                altTitle.remove()
            }
            // A missing element yields an empty string instead of aborting the entire search
            const heading = entry.getElementsByClassName("mw-search-result-heading")[0]
            const snippet = entry.getElementsByClassName("searchresult")[0]
            return {
                title: heading?.textContent?.trim() ?? "",
                url: entry.getElementsByTagName("a")[0]?.href ?? "",
                snippet: snippet?.textContent ?? "",
            }
        })
    }

    /**
     * Downloads the page and cleans up the HTML: elements with a blacklisted class or id are removed,
     * relative links are made absolute and are opened in a new tab.
     *
     * @param pageName the name of the page on this wiki
     * @param options if 'firstParagraphOnly' is set, only the contents of the first paragraph are returned
     * @returns the HTML, or undefined if the response contains no text (or no paragraph, if only the first paragraph was requested)
     */
    public async GetArticleAsync(
        pageName: string,
        options: {
            firstParagraphOnly?: false | boolean
        }
    ): Promise<string | undefined> {
        const response = await Utils.downloadJson(this.getDataUrl(pageName))
        const html = response?.parse?.text?.["*"]
        if (typeof html !== "string") {
            return undefined
        }

        const div = document.createElement("div")
        div.innerHTML = html
        const content = div.children[0]
        if (content === undefined) {
            // Empty document, nothing to clean
            return undefined
        }

        for (const forbiddenClass of Wikipedia.classesToRemove) {
            // Copy into an array first: the collection is live and shrinks while removing
            for (const element of Array.from(content.getElementsByClassName(forbiddenClass))) {
                element.remove()
            }
        }

        for (const forbiddenId of Wikipedia.idsToRemove) {
            content.querySelector("#" + forbiddenId)?.remove()
        }

        // Rewrite relative links to absolute links + open them in a new tab
        for (const link of Array.from(content.getElementsByTagName("a"))) {
            // note: link.getAttribute("href") gets the textual value, link.href is the rewritten version which'll contain the host for relative paths
            const href = link.getAttribute("href")
            if (!href?.startsWith("/")) {
                continue
            }
            link.target = "_blank"
            link.href = this.toAbsoluteUrl(href)
        }

        if (options?.firstParagraphOnly) {
            return content.getElementsByTagName("p").item(0)?.innerHTML
        }
        return content.innerHTML
    }

    /**
     * Converts a href starting with '/' into an absolute URL.
     * A protocol-relative href ('//host/path') only gets the protocol of the backend,
     * a site-relative href ('/path') gets the entire backend prepended.
     */
    private toAbsoluteUrl(href: string): string {
        if (href.startsWith("//")) {
            const protocol = this.backend.substring(0, this.backend.indexOf(":") + 1)
            return protocol + href
        }
        return this.backend + href
    }
}