Cleanup of wikipedia and download functions

This commit is contained in:
pietervdvn 2022-05-26 13:23:25 +02:00
parent 9bedf8e681
commit 48953cf266
4 changed files with 167 additions and 84 deletions

View file

@ -3,6 +3,7 @@
*/
import {Utils} from "../../Utils";
import {UIEventSource} from "../UIEventSource";
import {WikipediaBoxOptions} from "../../UI/Wikipedia/WikipediaBox";
export default class Wikipedia {
@ -29,55 +30,133 @@ export default class Wikipedia {
// Shared cache of article requests, so that the same article is only fetched once.
// Keys are strings built by the GetArticle methods from the wiki (language or backend), the page name and the firstParagraphOnly-flag;
// values are the event sources which will hold either {success: <html>} or {error: ...}
private static readonly _cache = new Map<string, UIEventSource<{ success: string } | { error: any }>>()
/**
 * Gives the event source which will contain the requested article, or the error if fetching it failed.
 *
 * Requests are memoized in `Wikipedia._cache`: asking for the same language, page name and
 * `firstParagraphOnly`-value twice yields the very same event source and triggers only one download.
 *
 * @param options `pageName` is the article title; `language` defaults to "en"; `firstParagraphOnly` defaults to false
 */
public static GetArticle(options: {
    pageName: string,
    language?: "en" | string,
    firstParagraphOnly?: false | boolean
}): UIEventSource<{ success: string } | { error: any }> {
    const language = options.language ?? "en"
    const firstParagraphOnly = options.firstParagraphOnly ?? false
    const cacheKey = `${language}:${options.pageName}:${firstParagraphOnly}`

    const known = Wikipedia._cache.get(cacheKey)
    if (known === undefined) {
        // First request for this article: start the download and remember the source
        const fresh = UIEventSource.FromPromiseWithErr(Wikipedia.GetArticleAsync(options))
        Wikipedia._cache.set(cacheKey, fresh)
        return fresh;
    }
    return known
}
/**
 * Builds the URL of the `action=parse` API request for the given article on the wikipedia of the given language.
 *
 * The page name is percent-encoded before it is appended to the query string. Without this, titles
 * containing characters with a special meaning in a query string (`&`, `#`, `?`, `+`; e.g. "C++" or "AT&T")
 * would be cut off or altered and the wrong page would be requested.
 *
 * @param options `pageName` is the plain (not yet URL-encoded) article title; `language` defaults to "en"
 * @returns the request URL
 */
public static getDataUrl(options: {language?: "en" | string, pageName: string}): string{
    const language = options.language ?? "en"
    return `https://${language}.wikipedia.org/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + encodeURIComponent(options.pageName)
}
private readonly _backend: string;
public static getPageUrl(options: {language?: "en" | string, pageName: string}): string{
return `https://${options.language ?? "en"}.wikipedia.org/wiki/` + options.pageName
constructor(options?: ({ language?: "en" | string } | { backend?: string })) {
this._backend = Wikipedia.getBackendUrl(options ?? {});
}
/**
* Tries to split a '<language>:<article-name>' string into the language and the article name.
*
* The pattern is not anchored. For a regular input, the non-empty text before the first ':' becomes `language`
* and the remainder of that line (further colons included) becomes `pageName`.
* Returns `undefined` when nothing matches (e.g. the input has no ':'), even though the declared return type does not mention it.
*
* Wikipedia.extractLanguageAndName("qsdf") // => undefined
* Wikipedia.extractLanguageAndName("nl:Warandeputten") // => {language: "nl", pageName: "Warandeputten"}
*/
public static extractLanguageAndName(input: string):{language: string, pageName: string} {
public static extractLanguageAndName(input: string): { language: string, pageName: string } {
// The string is interpreted as a regular expression by String.match
const matched = input.match("([^:]+):(.*)")
if(matched === undefined || matched === null){
if (matched === undefined || matched === null) {
return undefined
}
// matched[0] is the entire match and is not needed
const [_ , language, pageName] = matched
const [_, language, pageName] = matched
return {
language, pageName
}
}
public static async GetArticleAsync(options: {
pageName: string,
language?: "en" | string,
firstParagraphOnly?: false | boolean
}): Promise<string> {
const response = await Utils.downloadJson(Wikipedia.getDataUrl(options))
/**
 * Extracts the actual pagename; returns undefined if this came from a different wikimedia entry
 *
 * The input must start with the backend of this instance, directly followed by the `/wiki/<pagename>` path.
 * The path is matched from its very start, so a `wiki/` which merely occurs somewhere later in the URL
 * (e.g. within a query string, or on another host which only shares the backend as prefix) is not mistaken for an article link.
 *
 * new Wikipedia({backend: "https://wiki.openstreetmap.org"}).extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => "NL:Speelbos"
 * new Wikipedia().extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => undefined
 *
 * @param input the full URL of a page
 * @returns the part of the URL after `/wiki/`, exactly as it is written in the URL; undefined if the URL does not have the expected form
 */
public extractPageName(input: string): string | undefined {
    if (!input.startsWith(this._backend)) {
        return undefined
    }
    const path = input.substring(this._backend.length);
    // Anchored at the start: what follows the backend must be the '/wiki/'-path itself
    const matched = /^\/?wiki\/(.+)/.exec(path)
    // RegExp.exec yields null if there is no match; the optional chain turns that into undefined
    return matched?.[1]
}
/**
 * Determines the base URL of the wiki to use.
 *
 * An explicitly given `backend` wins; otherwise the wikipedia of the given `language` is used;
 * if neither is given, the English wikipedia is used.
 * "https://" is prepended if the result does not start with "http" yet.
 *
 * @param options either a language or a backend
 * @returns the base URL, including the protocol
 */
private static getBackendUrl(options: { language?: "en" | string } | { backend?: "en.wikipedia.org" | string }): string {
    const explicitBackend = options["backend"]
    const language = options["language"]

    let host: string
    if (explicitBackend) {
        host = explicitBackend
    } else if (language) {
        host = `${language}.wikipedia.org`
    } else {
        host = "en.wikipedia.org"
    }

    return host.startsWith("http") ? host : "https://" + host
}
/**
 * Gives the event source which will contain the requested article of this wiki, or the error if fetching it failed.
 *
 * Requests are memoized in the shared `Wikipedia._cache`, keyed on the backend, the page name and the
 * `firstParagraphOnly`-option: the same request always yields the same event source and triggers only one download.
 *
 * @param pageName the title of the page on this wiki
 * @param options only `firstParagraphOnly` (false if not set) is read here; the options are passed on to `GetArticleAsync`
 */
public GetArticle(pageName: string, options: WikipediaBoxOptions): UIEventSource<{ success: string } | { error: any }> {
    const firstParagraphOnly = options.firstParagraphOnly ?? false
    const cacheKey = `${this._backend}:${pageName}:${firstParagraphOnly}`

    const known = Wikipedia._cache.get(cacheKey)
    if (known === undefined) {
        // First request for this article: start the download and remember the source
        const fresh = UIEventSource.FromPromiseWithErr(this.GetArticleAsync(pageName, options))
        Wikipedia._cache.set(cacheKey, fresh)
        return fresh;
    }
    return known
}
/**
 * Builds the URL of the `action=parse` API request for the given page on this wiki.
 *
 * The page name is percent-encoded before it is appended to the query string. Without this, titles
 * containing characters with a special meaning in a query string (`&`, `#`, `?`, `+`; e.g. "C++" or "AT&T")
 * would be cut off or altered and the wrong page would be requested.
 *
 * @param pageName the plain (not yet URL-encoded) title of the page
 * @returns the request URL on the backend of this instance
 */
public getDataUrl(pageName: string): string {
    return `${this._backend}/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + encodeURIComponent(pageName)
}
/**
 * Gives the URL of the human-readable page with the given name on this wiki.
 * The page name is appended as is, directly after `<backend>/wiki/`.
 *
 * @param pageName the title of the page
 */
public getPageUrl(pageName: string): string {
    const wikiRoot = this._backend + "/wiki/"
    return wikiRoot + pageName
}
/**
 * Textual search of the specified wiki-instance. If searching Wikipedia, we recommend using wikidata.search instead
 *
 * The request carries `origin=*`, just like the URL built by `getDataUrl`: MediaWiki only answers
 * anonymous cross-origin requests (i.e. requests made by a browser from another site) if this parameter is set.
 *
 * @param searchTerm the text to search for; it is URL-encoded before being sent
 * @returns the `query.search` list of the API response
 */
public async search(searchTerm: string): Promise<{ title: string, snippet: string }[]> {
    const url = this._backend + "/w/api.php?action=query&format=json&origin=*&list=search&srsearch=" + encodeURIComponent(searchTerm);
    return (await Utils.downloadJson(url))["query"]["search"];
}
/**
 * Searches via 'index.php' and scrapes the result.
 * This gives better results than via the API
 *
 * If the wiki answers with a redirect, this is considered an exact match and a single result
 * (with an empty snippet) pointing to the redirect target is returned.
 * Otherwise, the returned HTML is parsed and one result is created per `mw-search-result` element.
 *
 * @param searchTerm the text to search for; it is URL-encoded before being sent
 */
public async searchViaIndex(searchTerm: string): Promise<{ title: string, snippet: string, url: string } []> {
    const searchUrl = this._backend + "/w/index.php?search=" + encodeURIComponent(searchTerm)
    const response = await Utils.downloadAdvanced(searchUrl);

    const redirectTarget = response["redirect"]
    if (redirectTarget) {
        // This is an exact match
        return [{
            title: this.extractPageName(redirectTarget),
            url: redirectTarget,
            snippet: ""
        }]
    }

    // Links starting with '/' are relative to the wiki: prefix them with the backend before parsing
    const absoluteLinkPrefix = "href=\"" + this._backend + "/"
    const page = document.createElement('html');
    page.innerHTML = response["content"].replace(/href="\//g, absoluteLinkPrefix);

    const resultList = page.getElementsByClassName("mw-search-results")[0]
    const hits = Array.from(resultList?.getElementsByClassName("mw-search-result") ?? [])
    return hits.map(hit => ({
        title: hit.getElementsByClassName("mw-search-result-heading")[0].textContent,
        url: hit.getElementsByTagName("a")[0].href,
        snippet: hit.getElementsByClassName("searchresult")[0].textContent
    }))
}
public async GetArticleAsync(pageName: string, options:
{
firstParagraphOnly?: false | boolean
}): Promise<string | undefined> {
const response = await Utils.downloadJson(this.getDataUrl(pageName))
if (response?.parse?.text === undefined) {
return undefined
}
const html = response["parse"]["text"]["*"];
if (html === undefined) {
return undefined
}
const div = document.createElement("div")
div.innerHTML = html
const content = Array.from(div.children)[0]
@ -98,11 +177,10 @@ export default class Wikipedia {
const links = Array.from(content.getElementsByTagName("a"))
// Rewrite relative links to absolute links + open them in a new tab
const language = options.language ?? "en"
links.filter(link => link.getAttribute("href")?.startsWith("/") ?? false).forEach(link => {
link.target = '_blank'
// note: link.getAttribute("href") gets the textual value, link.href is the rewritten version which'll contain the host for relative paths
link.href = `https://${language}.wikipedia.org${link.getAttribute("href")}`;
link.href = `${this._backend}${link.getAttribute("href")}`;
})
if (options?.firstParagraphOnly) {

View file

@ -16,18 +16,17 @@ import Link from "../Base/Link";
import WikidataPreviewBox from "./WikidataPreviewBox";
import {Paragraph} from "../Base/Paragraph";
/**
 * Options which steer how a WikipediaBox shows an article
 */
export interface WikipediaBoxOptions {
    /**
     * If set, the article is preceded by a link to the page it was taken from
     */
    addHeader: boolean;
    /**
     * Passed on to the Wikipedia-class when the article is requested
     * (presumably: only show the first paragraph of the article - the code acting on it is not visible here)
     */
    firstParagraphOnly: boolean;
}
export default class WikipediaBox extends Combine {
public static configuration = {
onlyFirstParagaph: false,
addHeader: false
}
constructor(wikidataIds: string[]) {
constructor(wikidataIds: string[], options?: WikipediaBoxOptions) {
const mainContents = []
const pages = wikidataIds.map(entry => WikipediaBox.createLinkedContent(entry.trim()))
options = options??{addHeader: false, firstParagraphOnly: true};
const pages = wikidataIds.map(entry => WikipediaBox.createLinkedContent(entry.trim(), options))
if (wikidataIds.length == 1) {
const page = pages[0]
mainContents.push(
@ -68,31 +67,29 @@ export default class WikipediaBox extends Combine {
super(mainContents)
this.SetClass("block rounded-xl subtle-background m-1 p-2 flex flex-col")
.SetStyle("max-height: inherit")
}
/**
* Creates the title, the contents and the link for a single entry.
* Entries which look like a wikidata identifier are handled by `createWikidatabox`, all other entries by `createWikipediabox`.
*/
private static createLinkedContent(entry: string): {
private static createLinkedContent(entry: string, options: WikipediaBoxOptions): {
titleElement: BaseUIElement,
contents: BaseUIElement,
linkElement: BaseUIElement
} {
// NOTE(review): the pattern is not anchored, so an entry which contains 'Q<digits>' anywhere (e.g. within an article title) is treated as wikidata identifier too - confirm this is intended
if (entry.match("[qQ][0-9]+")) {
return WikipediaBox.createWikidatabox(entry)
return WikipediaBox.createWikidatabox(entry, options)
} else {
console.log("Creating wikipedia box for ", entry)
return WikipediaBox.createWikipediabox(entry)
return WikipediaBox.createWikipediabox(entry, options)
}
}
/**
* Given a '<language>:<article-name>'-string, constructs the wikipedia article
* @param wikipediaArticle
* @private
*/
private static createWikipediabox(wikipediaArticle: string): {
private static createWikipediabox(wikipediaArticle: string, options: WikipediaBoxOptions): {
titleElement: BaseUIElement,
contents: BaseUIElement,
linkElement: BaseUIElement
@ -107,12 +104,13 @@ export default class WikipediaBox extends Combine {
linkElement: undefined
}
}
const url = Wikipedia.getPageUrl(article) // `https://${language}.wikipedia.org/wiki/${pagetitle}`
const wikipedia = new Wikipedia({language: article.language})
const url = wikipedia.getPageUrl(article.pageName)
const linkElement = new Link(Svg.pop_out_svg().SetStyle("width: 1.2rem").SetClass("block "), url, true) .SetClass("flex items-center enable-links")
return {
titleElement: new Title(article.pageName, 3),
contents: WikipediaBox.createContents(article.pageName, article.language),
contents: WikipediaBox.createContents(article.pageName, wikipedia, options),
linkElement
}
}
@ -120,7 +118,7 @@ export default class WikipediaBox extends Combine {
/**
* Given a `Q1234`, constructs a wikipedia box or wikidata box
*/
private static createWikidatabox(wikidataId: string): {
private static createWikidatabox(wikidataId: string, options: WikipediaBoxOptions): {
titleElement: BaseUIElement,
contents: BaseUIElement,
linkElement: BaseUIElement
@ -176,8 +174,9 @@ export default class WikipediaBox extends Combine {
}
const [pagetitle, language, wd] = <[string, string, WikidataResponse]>status
const wikipedia = new Wikipedia({language})
const quickFacts = WikidataPreviewBox.QuickFacts(wd);
return WikipediaBox.createContents(pagetitle, language, quickFacts)
return WikipediaBox.createContents(pagetitle, wikipedia, {topBar: quickFacts, ...options})
})
)
@ -223,13 +222,9 @@ export default class WikipediaBox extends Combine {
/**
* Returns the actual content in a scrollable way
*/
private static createContents(pagename: string, language: string, topBar?: BaseUIElement): BaseUIElement {
const wpOptions = {
pageName: pagename,
language: language,
firstParagraphOnly: WikipediaBox.configuration.onlyFirstParagaph
}
const htmlContent = Wikipedia.GetArticle(wpOptions)
private static createContents(pagename: string, wikipedia: Wikipedia, options:{
topBar?: BaseUIElement} & WikipediaBoxOptions): BaseUIElement {
const htmlContent = wikipedia.GetArticle(pagename, options)
const wp = Translations.t.general.wikipedia
const contents: UIEventSource<string | BaseUIElement> = htmlContent.map(htmlContent => {
if (htmlContent === undefined) {
@ -238,11 +233,11 @@ export default class WikipediaBox extends Combine {
}
if (htmlContent["success"] !== undefined) {
let content: BaseUIElement = new FixedUiElement(htmlContent["success"]);
if (WikipediaBox.configuration.addHeader) {
if (options?.addHeader) {
content = new Combine(
[
new Paragraph(
new Link(wp.fromWikipedia, Wikipedia.getPageUrl(wpOptions), true),
new Link(wp.fromWikipedia, wikipedia.getPageUrl(pagename), true),
),
new Paragraph(
content
@ -261,7 +256,7 @@ export default class WikipediaBox extends Combine {
})
return new Combine([
topBar?.SetClass("border-2 border-grey rounded-lg m-1 mb-0"),
options?.topBar?.SetClass("border-2 border-grey rounded-lg m-1 mb-0"),
new VariableUiElement(contents)
.SetClass("block pl-6 pt-2")])
}

View file

@ -9,7 +9,7 @@ export class Utils {
*/
public static runningFromConsole = typeof window === "undefined";
public static readonly assets_path = "./assets/svg/";
public static externalDownloadFunction: (url: string, headers?: any) => Promise<any>;
public static externalDownloadFunction: (url: string, headers?: any) => Promise<{ content: string } | { redirect: string }>;
public static Special_visualizations_tagsToApplyHelpText = `These can either be a tag to add, such as \`amenity=fast_food\` or can use a substitution, e.g. \`addr:housenumber=$number\`.
This new point will then have the tags \`amenity=fast_food\` and \`addr:housenumber\` with the value that was saved in \`number\` in the original feature.
@ -517,17 +517,17 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
/**
* Apply a function on every leaf of the JSON; used to rewrite parts of the JSON.
* Returns a modified copy of the original object.
*
*
* 'null' and 'undefined' are _always_ considered a leaf, even if 'isLeaf' says it isn't
*
*
* Hangs if the object contains a loop
*
*
* // should walk a json
* const walked = Utils.WalkJson({
* key: "value"
* }, (x: string) => x + "!")
* walked // => {key: "value!"}
*
*
* // should preserve undefined and null:
* const walked = Utils.WalkJson({
* u: undefined,
@ -535,7 +535,7 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
* v: "value"
* }, (x) => {if(x !== undefined && x !== null){return x+"!}; return x})
* walked // => {v: "value!", u: undefined, n: null}
*
*
* // should preserve undefined and null, also with a negative isLeaf:
* const walked = Utils.WalkJson({
* u: undefined,
@ -561,8 +561,8 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
return f(json, path)
}
if (Array.isArray(json)) {
return json.map((sub,i) => {
return Utils.WalkJson(sub, f, isLeaf, [...path,""+i]);
return json.map((sub, i) => {
return Utils.WalkJson(sub, f, isLeaf, [...path, "" + i]);
})
}
@ -575,7 +575,7 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
/**
* Walks an object recursively, will execute the 'collect'-callback on every leaf.
*
*
* Will hang on objects with loops
*/
static WalkObject(json: any, collect: (v: number | string | boolean | undefined, path: string[]) => any, isLeaf: (object) => boolean = undefined, path = []): void {
@ -664,7 +664,16 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
Utils.injectedDownloads[url] = data
}
/**
* Downloads the given URL and gives the contents of the response.
* This delegates to `Utils.downloadAdvanced` and only keeps the `content`-field of its result,
* so the result is `undefined` if that call yielded a redirect instead of content.
*/
public static download(url: string, headers?: any): Promise<string> {
public static async download(url: string, headers?: any): Promise<string | undefined> {
return (await Utils.downloadAdvanced(url, headers))["content"]
}
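/**
* Downloads the given URL and reports how the server answered:
* resolves with `{content}` (the response body) or with `{redirect}` (the target location) if the server answered with a redirect.
* @param url the URL to download
* @param headers optional headers to send along with the request
*/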
public static downloadAdvanced(url: string, headers?: any): Promise<{ content: string } | { redirect: string }> {
if (this.externalDownloadFunction !== undefined) {
return this.externalDownloadFunction(url, headers)
}
@ -673,7 +682,9 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
const xhr = new XMLHttpRequest();
xhr.onload = () => {
if (xhr.status == 200) {
resolve(xhr.response)
resolve({content: xhr.response})
} else if (xhr.status === 302) {
resolve({redirect: xhr.getResponseHeader("location")})
} else if (xhr.status === 509 || xhr.status === 429) {
reject("rate limited")
} else {
@ -682,7 +693,6 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
};
xhr.open('GET', url);
if (headers !== undefined) {
for (const key in headers) {
xhr.setRequestHeader(key, headers[key])
}

View file

@ -5,10 +5,11 @@ import * as https from "https";
import {LayoutConfigJson} from "../Models/ThemeConfig/Json/LayoutConfigJson";
import {LayerConfigJson} from "../Models/ThemeConfig/Json/LayerConfigJson";
import xml2js from 'xml2js';
export default class ScriptUtils {
/**
* Installs the download function of ScriptUtils as `Utils.externalDownloadFunction`.
* When this function is set, `Utils.downloadAdvanced` uses it instead of creating an XMLHttpRequest.
*/
public static fixUtils() {
Utils.externalDownloadFunction = ScriptUtils.DownloadJSON
Utils.externalDownloadFunction = ScriptUtils.Download
}
@ -44,8 +45,13 @@ export default class ScriptUtils {
})
}
/**
 * Downloads the given URL with `ScriptUtils.Download` and parses the response as JSON.
 *
 * If the response is not valid JSON, the URL and the offending payload are logged before the
 * parse error is rethrown, as the bare error does not tell which download failed.
 *
 * @param url the URL to download
 * @param headers optional headers, passed on to `ScriptUtils.Download`
 * @returns the parsed JSON
 */
private static async DownloadJSON(url: string, headers?: any): Promise<any> {
    const data = await ScriptUtils.Download(url, headers);
    try {
        return JSON.parse(data.content)
    } catch (e) {
        console.error("Could not parse the following as JSON:", data.content, "(downloaded from", url, ")")
        throw e
    }
}
private static DownloadJSON(url, headers?: any): Promise<any> {
private static Download(url, headers?: any): Promise<{content: string}> {
return new Promise((resolve, reject) => {
try {
headers = headers ?? {}
@ -67,13 +73,7 @@ export default class ScriptUtils {
});
res.addListener('end', function () {
const result = parts.join("")
try {
resolve(JSON.parse(result))
} catch (e) {
console.error("Could not parse the following as JSON:", result)
reject(e)
}
resolve({content: parts.join("")})
});
})
} catch (e) {