From 48953cf266f71ff2a53be8f74c2d6ef2fb03e95c Mon Sep 17 00:00:00 2001
From: pietervdvn
Date: Thu, 26 May 2022 13:23:25 +0200
Subject: [PATCH] Cleanup of wikipedia and download functions

---
 Logic/Web/Wikipedia.ts       | 144 +++++++++++++++++++++++++++--------
 UI/Wikipedia/WikipediaBox.ts |  55 ++++++-------
 Utils.ts                     |  34 ++++++---
 scripts/ScriptUtils.ts       |  18 ++---
 4 files changed, 167 insertions(+), 84 deletions(-)

diff --git a/Logic/Web/Wikipedia.ts b/Logic/Web/Wikipedia.ts
index e96df52b70..3b72024621 100644
--- a/Logic/Web/Wikipedia.ts
+++ b/Logic/Web/Wikipedia.ts
@@ -3,6 +3,7 @@
  */
 import {Utils} from "../../Utils";
 import {UIEventSource} from "../UIEventSource";
+import {WikipediaBoxOptions} from "../../UI/Wikipedia/WikipediaBox";
 
 export default class Wikipedia {
 
@@ -29,55 +30,133 @@ export default class Wikipedia {
     private static readonly _cache = new Map<string, UIEventSource<{ success: string } | { error: any }>>()
 
-    public static GetArticle(options: {
-        pageName: string,
-        language?: "en" | string,
-        firstParagraphOnly?: false | boolean
-    }): UIEventSource<{ success: string } | { error: any }> {
-        const key = (options.language ?? "en") + ":" + options.pageName + ":" + (options.firstParagraphOnly ?? false)
-        const cached = Wikipedia._cache.get(key)
-        if (cached !== undefined) {
-            return cached
-        }
-        const v = UIEventSource.FromPromiseWithErr(Wikipedia.GetArticleAsync(options))
-        Wikipedia._cache.set(key, v)
-        return v;
-    }
 
-    public static getDataUrl(options: {language?: "en" | string, pageName: string}): string{
-        return `https://${options.language ?? "en"}.wikipedia.org/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + options.pageName
-    }
+    private readonly _backend: string;
 
-    public static getPageUrl(options: {language?: "en" | string, pageName: string}): string{
-        return `https://${options.language ?? "en"}.wikipedia.org/wiki/` + options.pageName
+    constructor(options?: ({ language?: "en" | string } | { backend?: string })) {
+        this._backend = Wikipedia.getBackendUrl(options ?? {});
     }
 
     /**
      * Tries to extract the language and article name from the given string
-     *
+     *
      * Wikipedia.extractLanguageAndName("qsdf") // => undefined
      * Wikipedia.extractLanguageAndName("nl:Warandeputten") // => {language: "nl", pageName: "Warandeputten"}
      */
-    public static extractLanguageAndName(input: string):{language: string, pageName: string} {
+    public static extractLanguageAndName(input: string): { language: string, pageName: string } {
         const matched = input.match("([^:]+):(.*)")
-        if(matched === undefined || matched === null){
+        if (matched === undefined || matched === null) {
             return undefined
         }
-        const [_ , language, pageName] = matched
+        const [_, language, pageName] = matched
         return {
             language, pageName
         }
     }
-
-    public static async GetArticleAsync(options: {
-        pageName: string,
-        language?: "en" | string,
-        firstParagraphOnly?: false | boolean
-    }): Promise<string> {
-        const response = await Utils.downloadJson(Wikipedia.getDataUrl(options))
+    /**
+     * Extracts the actual pagename; returns undefined if this came from a different wikimedia entry
+     *
+     * new Wikipedia({backend: "https://wiki.openstreetmap.org"}).extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => "NL:Speelbos"
+     * new Wikipedia().extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => undefined
+     */
+    public extractPageName(input: string):string | undefined{
+        if(!input.startsWith(this._backend)){
+            return undefined
+        }
+        input = input.substring(this._backend.length);
+
+        const matched = input.match("/?wiki/\(.+\)")
+        if (matched === undefined || matched === null) {
+            return undefined
+        }
+        const [_, pageName] = matched
+        return pageName
+    }
+
+    private static getBackendUrl(options: { language?: "en" | string } | { backend?: "en.wikipedia.org" | string }): string {
+        let backend = "en.wikipedia.org"
+        if (options["backend"]) {
+            backend = options["backend"]
+        } else if (options["language"]) {
+            backend = `${options["language"] ?? "en"}.wikipedia.org`
+        }
+        if (!backend.startsWith("http")) {
+            backend = "https://" + backend
+        }
+        return backend
+    }
+
+    public GetArticle(pageName: string, options: WikipediaBoxOptions): UIEventSource<{ success: string } | { error: any }> {
+        const key = this._backend + ":" + pageName + ":" + (options.firstParagraphOnly ?? false)
+        const cached = Wikipedia._cache.get(key)
+        if (cached !== undefined) {
+            return cached
+        }
+        const v = UIEventSource.FromPromiseWithErr(this.GetArticleAsync(pageName, options))
+        Wikipedia._cache.set(key, v)
+        return v;
+    }
+
+    public getDataUrl(pageName: string): string {
+        return `${this._backend}/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + pageName
+    }
+
+    public getPageUrl(pageName: string): string {
+        return `${this._backend}/wiki/${pageName}`
+    }
+
+    /**
+     * Textual search of the specified wiki-instance. If searching Wikipedia, we recommend using wikidata.search instead
+     * @param searchTerm
+     */
+    public async search(searchTerm: string): Promise<{ title: string, snippet: string }[]> {
+        const url = this._backend + "/w/api.php?action=query&format=json&list=search&srsearch=" + encodeURIComponent(searchTerm);
+        return (await Utils.downloadJson(url))["query"]["search"];
+    }
+
+    /**
+     * Searches via 'index.php' and scrapes the result.
+     * This gives better results than via the API
+     * @param searchTerm
+     */
+    public async searchViaIndex(searchTerm: string): Promise<{ title: string, snippet: string, url: string } []> {
+        const url = `${this._backend}/w/index.php?search=${encodeURIComponent(searchTerm)}`
+        const result = await Utils.downloadAdvanced(url);
+        if(result["redirect"] ){
+            // This is an exact match
+            return [{
+                title: this.extractPageName(result["redirect"]),
+                url: result["redirect"],
+                snippet: ""
+            }]
+        }
+        const el = document.createElement('html');
+        el.innerHTML = result["content"].replace(/href="\//g, "href=\""+this._backend+"/");
+        const searchResults = el.getElementsByClassName("mw-search-results")
+        const individualResults = Array.from(searchResults[0]?.getElementsByClassName("mw-search-result") ?? [])
+        return individualResults.map(result => {
+            return {
+                title: result.getElementsByClassName("mw-search-result-heading")[0].textContent,
+                url: result.getElementsByTagName("a")[0].href,
+                snippet: result.getElementsByClassName("searchresult")[0].textContent
+            }
+        })
+    }
+
+    public async GetArticleAsync(pageName: string, options:
+        {
+            firstParagraphOnly?: false | boolean
+        }): Promise<string> {
+
+        const response = await Utils.downloadJson(this.getDataUrl(pageName))
+        if (response?.parse?.text === undefined) {
+            return undefined
+        }
         const html = response["parse"]["text"]["*"];
-
+        if (html === undefined) {
+            return undefined
+        }
         const div = document.createElement("div")
         div.innerHTML = html
         const content = Array.from(div.children)[0]
@@ -98,11 +177,10 @@ export default class Wikipedia {
         const links = Array.from(content.getElementsByTagName("a"))
 
         // Rewrite relative links to absolute links + open them in a new tab
-        const language = options.language ?? "en"
         links.filter(link => link.getAttribute("href")?.startsWith("/") ?? false).forEach(link => {
             link.target = '_blank'
             // note: link.getAttribute("href") gets the textual value, link.href is the rewritten version which'll contain the host for relative paths
-            link.href = `https://${language}.wikipedia.org${link.getAttribute("href")}`;
+            link.href = `${this._backend}${link.getAttribute("href")}`;
         })
 
         if (options?.firstParagraphOnly) {
diff --git a/UI/Wikipedia/WikipediaBox.ts b/UI/Wikipedia/WikipediaBox.ts
index fa55376c5d..ec9722fdca 100644
--- a/UI/Wikipedia/WikipediaBox.ts
+++ b/UI/Wikipedia/WikipediaBox.ts
@@ -16,18 +16,17 @@ import Link from "../Base/Link";
 import WikidataPreviewBox from "./WikidataPreviewBox";
 import {Paragraph} from "../Base/Paragraph";
 
+export interface WikipediaBoxOptions {
+    addHeader: boolean,
+    firstParagraphOnly: boolean
+}
+
 export default class WikipediaBox extends Combine {
 
-    public static configuration = {
-        onlyFirstParagaph: false,
-        addHeader: false
-    }
-
-    constructor(wikidataIds: string[]) {
-
+    constructor(wikidataIds: string[], options?: WikipediaBoxOptions) {
         const mainContents = []
-
-        const pages = wikidataIds.map(entry => WikipediaBox.createLinkedContent(entry.trim()))
+        options = options??{addHeader: false, firstParagraphOnly: true};
+        const pages = wikidataIds.map(entry => WikipediaBox.createLinkedContent(entry.trim(), options))
         if (wikidataIds.length == 1) {
             const page = pages[0]
             mainContents.push(
@@ -68,31 +67,29 @@ export default class WikipediaBox extends Combine {
         super(mainContents)
-
+
         this.SetClass("block rounded-xl subtle-background m-1 p-2 flex flex-col")
             .SetStyle("max-height: inherit")
     }
 
-    private static createLinkedContent(entry: string): {
+    private static createLinkedContent(entry: string, options: WikipediaBoxOptions): {
         titleElement: BaseUIElement,
         contents: BaseUIElement,
         linkElement: BaseUIElement
     } {
         if (entry.match("[qQ][0-9]+")) {
-            return WikipediaBox.createWikidatabox(entry)
+            return WikipediaBox.createWikidatabox(entry, options)
         } else {
             console.log("Creating wikipedia box for ", entry)
-            return WikipediaBox.createWikipediabox(entry)
+            return WikipediaBox.createWikipediabox(entry, options)
         }
     }
 
     /**
      * Given a ':'-string, constructs the wikipedia article
-     * @param wikipediaArticle
-     * @private
      */
-    private static createWikipediabox(wikipediaArticle: string): {
+    private static createWikipediabox(wikipediaArticle: string, options: WikipediaBoxOptions): {
         titleElement: BaseUIElement,
         contents: BaseUIElement,
         linkElement: BaseUIElement
     } {
@@ -107,12 +104,13 @@ export default class WikipediaBox extends Combine {
                 linkElement: undefined
             }
         }
-        const url = Wikipedia.getPageUrl(article) // `https://${language}.wikipedia.org/wiki/${pagetitle}`
+        const wikipedia = new Wikipedia({language: article.language})
+        const url = wikipedia.getPageUrl(article.pageName)
         const linkElement = new Link(Svg.pop_out_svg().SetStyle("width: 1.2rem").SetClass("block "), url, true)
             .SetClass("flex items-center enable-links")
         return {
             titleElement: new Title(article.pageName, 3),
-            contents: WikipediaBox.createContents(article.pageName, article.language),
+            contents: WikipediaBox.createContents(article.pageName, wikipedia, options),
             linkElement
         }
     }
@@ -120,7 +118,7 @@ export default class WikipediaBox extends Combine {
     /**
      * Given a `Q1234`, constructs a wikipedia box or wikidata box
      */
-    private static createWikidatabox(wikidataId: string): {
+    private static createWikidatabox(wikidataId: string, options: WikipediaBoxOptions): {
         titleElement: BaseUIElement,
         contents: BaseUIElement,
         linkElement: BaseUIElement
@@ -176,8 +174,9 @@ export default class WikipediaBox extends Combine {
                 }
                 const [pagetitle, language, wd] = <[string, string, WikidataResponse]>status
+                const wikipedia = new Wikipedia({language})
                 const quickFacts = WikidataPreviewBox.QuickFacts(wd);
-                return WikipediaBox.createContents(pagetitle, language, quickFacts)
+                return WikipediaBox.createContents(pagetitle, wikipedia, {topBar: quickFacts, ...options})
             })
         )
@@ -223,13 +222,9 @@ export default class WikipediaBox extends Combine {
     /**
      * Returns the actual content in a scrollable way
      */
-    private static createContents(pagename: string, language: string, topBar?: BaseUIElement): BaseUIElement {
-        const wpOptions = {
-            pageName: pagename,
-            language: language,
-            firstParagraphOnly: WikipediaBox.configuration.onlyFirstParagaph
-        }
-        const htmlContent = Wikipedia.GetArticle(wpOptions)
+    private static createContents(pagename: string, wikipedia: Wikipedia, options:{
+        topBar?: BaseUIElement} & WikipediaBoxOptions): BaseUIElement {
+        const htmlContent = wikipedia.GetArticle(pagename, options)
         const wp = Translations.t.general.wikipedia
         const contents: UIEventSource = htmlContent.map(htmlContent => {
             if (htmlContent === undefined) {
@@ -238,11 +233,11 @@ export default class WikipediaBox extends Combine {
             }
             if (htmlContent["success"] !== undefined) {
                 let content: BaseUIElement = new FixedUiElement(htmlContent["success"]);
-                if (WikipediaBox.configuration.addHeader) {
+                if (options?.addHeader) {
                     content = new Combine(
                         [
                             new Paragraph(
-                                new Link(wp.fromWikipedia, Wikipedia.getPageUrl(wpOptions), true),
+                                new Link(wp.fromWikipedia, wikipedia.getPageUrl(pagename), true),
                             ),
                             new Paragraph(
                                 content
@@ -261,7 +256,7 @@ export default class WikipediaBox extends Combine {
         })
 
         return new Combine([
-            topBar?.SetClass("border-2 border-grey rounded-lg m-1 mb-0"),
+            options?.topBar?.SetClass("border-2 border-grey rounded-lg m-1 mb-0"),
             new VariableUiElement(contents)
                 .SetClass("block pl-6 pt-2")])
     }
diff --git a/Utils.ts b/Utils.ts
index 6c3e862a91..acf0bd110d 100644
--- a/Utils.ts
+++ b/Utils.ts
@@ -9,7 +9,7 @@ export class Utils {
      */
     public static runningFromConsole = typeof window === "undefined";
     public static readonly assets_path = "./assets/svg/";
-    public static externalDownloadFunction: (url: string, headers?: any) => Promise<string>;
+    public static externalDownloadFunction: (url: string, headers?: any) => Promise<{ content: string } | { redirect: string }>;
 
     public static Special_visualizations_tagsToApplyHelpText = `These can either be a tag to add, such as \`amenity=fast_food\` or can use a substitution, e.g. \`addr:housenumber=$number\`.
 This new point will then have the tags \`amenity=fast_food\` and \`addr:housenumber\` with the value that was saved in \`number\` in the original feature.
@@ -517,17 +517,17 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
     /**
      * Apply a function on every leaf of the JSON; used to rewrite parts of the JSON.
      * Returns a modified copy of the original object.
-     *
+     *
      * 'null' and 'undefined' are _always_ considered a leaf, even if 'isLeaf' says it isn't
-     *
+     *
      * Hangs if the object contains a loop
-     *
+     *
      * // should walk a json
      * const walked = Utils.WalkJson({
      *     key: "value"
      * }, (x: string) => x + "!")
      * walked // => {key: "value!"}
-     *
+     *
      * // should preserve undefined and null:
      * const walked = Utils.WalkJson({
      *     u: undefined,
@@ -535,7 +535,7 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
      *     v: "value"
      * }, (x) => {if(x !== undefined && x !== null){return x+"!}; return x})
      * walked // => {v: "value!", u: undefined, n: null}
-     *
+     *
      * // should preserve undefined and null, also with a negative isLeaf:
      * const walked = Utils.WalkJson({
      *     u: undefined,
@@ -561,8 +561,8 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
             return f(json, path)
         }
         if (Array.isArray(json)) {
-            return json.map((sub,i) => {
-                return Utils.WalkJson(sub, f, isLeaf, [...path,""+i]);
+            return json.map((sub, i) => {
+                return Utils.WalkJson(sub, f, isLeaf, [...path, "" + i]);
             })
         }
 
@@ -575,7 +575,7 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
 
     /**
      * Walks an object recursively, will execute the 'collect'-callback on every leaf.
-     *
+     *
      * Will hang on objects with loops
      */
     static WalkObject(json: any, collect: (v: number | string | boolean | undefined, path: string[]) => any, isLeaf: (object) => boolean = undefined, path = []): void {
@@ -664,7 +664,16 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
         Utils.injectedDownloads[url] = data
     }
 
-    public static download(url: string, headers?: any): Promise<string> {
+    public static async download(url: string, headers?: any): Promise<string> {
+        return (await Utils.downloadAdvanced(url, headers))["content"]
+    }
+
+    /**
+     * Download function which also indicates advanced options, such as redirects
+     * @param url
+     * @param headers
+     */
+    public static downloadAdvanced(url: string, headers?: any): Promise<{ content: string } | { redirect: string }> {
         if (this.externalDownloadFunction !== undefined) {
             return this.externalDownloadFunction(url, headers)
         }
@@ -673,7 +682,9 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
             const xhr = new XMLHttpRequest();
             xhr.onload = () => {
                 if (xhr.status == 200) {
-                    resolve(xhr.response)
+                    resolve({content: xhr.response})
+                } else if (xhr.status === 302) {
+                    resolve({redirect: xhr.getResponseHeader("location")})
                 } else if (xhr.status === 509 || xhr.status === 429) {
                     reject("rate limited")
                 } else {
@@ -682,7 +693,6 @@ In the case that MapComplete is pointed to the testing grounds, the edit will be
             };
             xhr.open('GET', url);
             if (headers !== undefined) {
-
                 for (const key in headers) {
                     xhr.setRequestHeader(key, headers[key])
                 }
diff --git a/scripts/ScriptUtils.ts b/scripts/ScriptUtils.ts
index ae938d87bf..09e87cc617 100644
--- a/scripts/ScriptUtils.ts
+++ b/scripts/ScriptUtils.ts
@@ -5,10 +5,11 @@ import * as https from "https";
 import {LayoutConfigJson} from "../Models/ThemeConfig/Json/LayoutConfigJson";
 import {LayerConfigJson} from "../Models/ThemeConfig/Json/LayerConfigJson";
 import xml2js from 'xml2js';
+
 export default class ScriptUtils {
 
     public static fixUtils() {
-        Utils.externalDownloadFunction = ScriptUtils.DownloadJSON
+        Utils.externalDownloadFunction = ScriptUtils.Download
     }
 
 
@@ -44,8 +45,13 @@ export default class ScriptUtils {
         })
     }
 
+
+    private static async DownloadJSON(url: string, headers?: any): Promise<any>{
+        const data = await ScriptUtils.Download(url, headers);
+        return JSON.parse(data.content)
+    }
-    private static DownloadJSON(url, headers?: any): Promise<any> {
+    private static Download(url, headers?: any): Promise<{content: string}> {
         return new Promise((resolve, reject) => {
             try {
                 headers = headers ?? {}
@@ -67,13 +73,7 @@ export default class ScriptUtils {
                 });
                 res.addListener('end', function () {
-                    const result = parts.join("")
-                    try {
-                        resolve(JSON.parse(result))
-                    } catch (e) {
-                        console.error("Could not parse the following as JSON:", result)
-                        reject(e)
-                    }
+                    resolve({content: parts.join("")})
                 });
             })
         } catch (e) {
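
Example usage (illustrative sketch, not part of the commit): the constructor options and method names below are taken from the refactored Wikipedia class in the diff above, but the import path, the search term and the surrounding demo script are assumptions.

    // Sketch only: exercises the instance-based Wikipedia API introduced by this patch.
    // GetArticleAsync parses the response with DOM calls, so a browser-like environment is assumed.
    import Wikipedia from "./Logic/Web/Wikipedia";

    async function demo(): Promise<void> {
        // The default backend is en.wikipedia.org; a language or a full backend URL can be passed instead.
        const osmWiki = new Wikipedia({backend: "https://wiki.openstreetmap.org"});
        const hits = await osmWiki.search("bench");
        console.log(hits.map(h => h.title));

        const nlWikipedia = new Wikipedia({language: "nl"});
        const articleHtml = await nlWikipedia.GetArticleAsync("Warandeputten", {firstParagraphOnly: true});
        console.log(articleHtml?.substring(0, 100));
    }

    demo().catch(console.error);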