Cleanup of wikipedia and download functions

This commit is contained in:
Pieter Vander Vennet 2022-05-26 13:23:25 +02:00
parent 9bedf8e681
commit 48953cf266
4 changed files with 167 additions and 84 deletions


@@ -3,6 +3,7 @@
*/
import {Utils} from "../../Utils";
import {UIEventSource} from "../UIEventSource";
import {WikipediaBoxOptions} from "../../UI/Wikipedia/WikipediaBox";
export default class Wikipedia {
@@ -29,55 +30,133 @@ export default class Wikipedia {
private static readonly _cache = new Map<string, UIEventSource<{ success: string } | { error: any }>>()
public static GetArticle(options: {
pageName: string,
language?: "en" | string,
firstParagraphOnly?: false | boolean
}): UIEventSource<{ success: string } | { error: any }> {
const key = (options.language ?? "en") + ":" + options.pageName + ":" + (options.firstParagraphOnly ?? false)
const cached = Wikipedia._cache.get(key)
if (cached !== undefined) {
return cached
}
const v = UIEventSource.FromPromiseWithErr(Wikipedia.GetArticleAsync(options))
Wikipedia._cache.set(key, v)
return v;
}
public static getDataUrl(options: {language?: "en" | string, pageName: string}): string{
return `https://${options.language ?? "en"}.wikipedia.org/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + options.pageName
}
private readonly _backend: string;
public static getPageUrl(options: {language?: "en" | string, pageName: string}): string{
return `https://${options.language ?? "en"}.wikipedia.org/wiki/` + options.pageName
constructor(options?: ({ language?: "en" | string } | { backend?: string })) {
this._backend = Wikipedia.getBackendUrl(options ?? {});
}
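// A minimal usage sketch of the constructor above (not part of this commit): pass either a
// wikipedia language code or a full backend URL; with no options, en.wikipedia.org is used.
//   new Wikipedia()                                             // backed by https://en.wikipedia.org
//   new Wikipedia({language: "nl"})                             // backed by https://nl.wikipedia.org
//   new Wikipedia({backend: "https://wiki.openstreetmap.org"})  // any other MediaWiki instance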
/**
* Tries to extract the language and article name from the given string
*
*
* Wikipedia.extractLanguageAndName("qsdf") // => undefined
* Wikipedia.extractLanguageAndName("nl:Warandeputten") // => {language: "nl", pageName: "Warandeputten"}
*/
public static extractLanguageAndName(input: string): { language: string, pageName: string } {
const matched = input.match("([^:]+):(.*)")
if (matched === undefined || matched === null) {
return undefined
}
const [_, language, pageName] = matched
return {
language, pageName
}
}
public static async GetArticleAsync(options: {
pageName: string,
language?: "en" | string,
firstParagraphOnly?: false | boolean
}): Promise<string> {
const response = await Utils.downloadJson(Wikipedia.getDataUrl(options))
/**
Extracts the actual pagename; returns undefined if the given URL belongs to a different wiki instance
*
* new Wikipedia({backend: "https://wiki.openstreetmap.org"}).extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => "NL:Speelbos"
* new Wikipedia().extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => undefined
*/
public extractPageName(input: string): string | undefined {
if (!input.startsWith(this._backend)) {
return undefined
}
input = input.substring(this._backend.length);
const matched = input.match("/?wiki/(.+)")
if (matched === undefined || matched === null) {
return undefined
}
const [_, pageName] = matched
return pageName
}
private static getBackendUrl(options: { language?: "en" | string } | { backend?: "en.wikipedia.org" | string }): string {
let backend = "en.wikipedia.org"
if (options["backend"]) {
backend = options["backend"]
} else if (options["language"]) {
backend = `${options["language"] ?? "en"}.wikipedia.org`
}
if (!backend.startsWith("http")) {
backend = "https://" + backend
}
return backend
}
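// Illustrative mapping of the option handling above (a sketch, derived from getBackendUrl's logic):
//   {}                                           // => "https://en.wikipedia.org"
//   {language: "nl"}                             // => "https://nl.wikipedia.org"
//   {backend: "wiki.openstreetmap.org"}          // => "https://wiki.openstreetmap.org"
//   {backend: "https://wiki.openstreetmap.org"}  // => "https://wiki.openstreetmap.org"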
public GetArticle(pageName: string, options: WikipediaBoxOptions): UIEventSource<{ success: string } | { error: any }> {
const key = this._backend + ":" + pageName + ":" + (options.firstParagraphOnly ?? false)
const cached = Wikipedia._cache.get(key)
if (cached !== undefined) {
return cached
}
const v = UIEventSource.FromPromiseWithErr(this.GetArticleAsync(pageName, options))
Wikipedia._cache.set(key, v)
return v;
}
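// A hedged usage sketch: results arrive asynchronously through the returned UIEventSource.
// The exact shape of WikipediaBoxOptions is not visible in this diff, so the options object
// below is only illustrative, and addCallbackAndRun is assumed to be available on UIEventSource.
//   new Wikipedia({language: "nl"})
//       .GetArticle("Warandeputten", {firstParagraphOnly: true} as WikipediaBoxOptions)
//       .addCallbackAndRun(article => {
//           if (article !== undefined && article["success"] !== undefined) {
//               console.log(article["success"])
//           }
//       })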
public getDataUrl(pageName: string): string {
return `${this._backend}/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + pageName
}
public getPageUrl(pageName: string): string {
return `${this._backend}/wiki/${pageName}`
}
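// For example (values follow directly from the URL templates above):
//   new Wikipedia({language: "nl"}).getPageUrl("Warandeputten")
//       // => "https://nl.wikipedia.org/wiki/Warandeputten"
//   new Wikipedia().getDataUrl("Lighthouse")
//       // => "https://en.wikipedia.org/w/api.php?action=parse&format=json&origin=*&prop=text&page=Lighthouse"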
/**
* Textual search of the specified wiki-instance. If searching Wikipedia, we recommend using wikidata.search instead
* @param searchTerm
*/
public async search(searchTerm: string): Promise<{ title: string, snippet: string }[]> {
const url = this._backend + "/w/api.php?action=query&format=json&list=search&srsearch=" + encodeURIComponent(searchTerm);
return (await Utils.downloadJson(url))["query"]["search"];
}
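// A hedged usage sketch: the fields mirror MediaWiki's list=search response as consumed above;
// the backend and search term are only illustrative.
//   const hits = await new Wikipedia({backend: "wiki.openstreetmap.org"}).search("speelbos")
//   hits.forEach(hit => console.log(hit.title, hit.snippet))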
/**
* Searches via 'index.php' and scrapes the result.
This gives better results than the API
* @param searchTerm
*/
public async searchViaIndex(searchTerm: string): Promise<{ title: string, snippet: string, url: string } []> {
const url = `${this._backend}/w/index.php?search=${encodeURIComponent(searchTerm)}`
const result = await Utils.downloadAdvanced(url);
if (result["redirect"]) {
// This is an exact match
return [{
title: this.extractPageName(result["redirect"]),
url: result["redirect"],
snippet: ""
}]
}
const el = document.createElement('html');
el.innerHTML = result["content"].replace(/href="\//g, "href=\""+this._backend+"/");
const searchResults = el.getElementsByClassName("mw-search-results")
const individualResults = Array.from(searchResults[0]?.getElementsByClassName("mw-search-result") ?? [])
return individualResults.map(result => {
return {
title: result.getElementsByClassName("mw-search-result-heading")[0].textContent,
url: result.getElementsByTagName("a")[0].href,
snippet: result.getElementsByClassName("searchresult")[0].textContent
}
})
}
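// A hedged usage sketch: an exact title match is reported via the redirect branch above as a
// single result with an empty snippet; otherwise the scraped result list is returned.
//   const results = await new Wikipedia().searchViaIndex("Lighthouse")
//   results.forEach(r => console.log(r.title, r.url))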
public async GetArticleAsync(pageName: string, options:
{
firstParagraphOnly?: false | boolean
}): Promise<string | undefined> {
const response = await Utils.downloadJson(this.getDataUrl(pageName))
if (response?.parse?.text === undefined) {
return undefined
}
const html = response["parse"]["text"]["*"];
if (html === undefined) {
return undefined
}
const div = document.createElement("div")
div.innerHTML = html
const content = Array.from(div.children)[0]
@@ -98,11 +177,10 @@ export default class Wikipedia {
const links = Array.from(content.getElementsByTagName("a"))
// Rewrite relative links to absolute links + open them in a new tab
links.filter(link => link.getAttribute("href")?.startsWith("/") ?? false).forEach(link => {
link.target = '_blank'
// note: link.getAttribute("href") gets the textual value, link.href is the rewritten version which'll contain the host for relative paths
link.href = `${this._backend}${link.getAttribute("href")}`;
})
})
if (options?.firstParagraphOnly) {