MapComplete/src/UI/InputElement/Validators/UrlValidator.ts

import { Validator } from "../Validator"
import { Translation } from "../../i18n/Translation"
import Translations from "../../i18n/Translations"

/**
 * Validator for web addresses.
 *
 * - `reformat` normalises the typed text into an absolute URL and strips well-known tracking parameters
 * - `isValid` rejects unparsable URLs, hosts without a dot and hosts listed in `spamWebsites`
 * - `getFeedback` explains why a spam site is rejected and warns about discouraged sites
 *
 * Input without a scheme is interpreted as `https://<input>`.
 */
export default class UrlValidator extends Validator {
    /**
     * Hosts (without a leading "www.") which are refused by `isValid`
     * and for which `getFeedback` returns the `spamSite` message.
     */
    private static readonly spamWebsites = new Set<string>([
        "booking.com",
        "hotel-details-guide.com",
        "tripingguide.com",
        "tripadvisor.com",
        "tripadvisor.co.uk",
        "tripadvisor.com.au",
        "katestravelexperience.eu",
        "hoteldetails.eu",
    ])

    /**
     * Hosts (without a leading "www.") for which `getFeedback` returns the `aggregator` message.
     * They are not rejected by `isValid`.
     */
    private static readonly discouragedWebsites = new Set<string>(["facebook.com"])

    /**
     * Query parameters which are dropped by `reformat`.
     * The names are stored in lowercase and compared case-insensitively,
     * as query parameter names are case-sensitive and e.g. `AdGroupId` is normally sent with capitals.
     */
    private static readonly trackingParams = new Set<string>(
        [
            "fbclid", // Oh god, how I hate the fbclid. Let it burn, burn in hell!
            "gclid",
            "cmpid",
            "agid",
            "utm",
            "utm_source",
            "utm_medium",
            "campaignid",
            "campaign",
            "AdGroupId",
            "AdGroup",
            "TargetId",
            "msclkid",
            "pk_source",
            "pk_medium",
            "pk_campaign",
            "pk_content",
            "pk_kwd",
        ].map((param) => param.toLowerCase())
    )

    /**
     * Matches input which already carries a scheme: `http://`, `https://` or a bare `http:`.
     * Case-insensitive, as (mobile) keyboards tend to capitalise the first letter ("Https://...").
     */
    private static readonly schemePattern = /^(https?:\/\/|http:)/i

    private readonly _forceHttps: boolean

    /**
     * @param name name of this validator, defaults to "url"
     * @param explanation documentation of this validator, defaults to a generic description
     * @param forceHttps if true, `reformat` rewrites the protocol to `https:`. Defaults to false
     */
    constructor(name?: string, explanation?: string, forceHttps?: boolean) {
        super(
            name ?? "url",
            explanation ??
                "The validatedTextField will format URLs to always be valid and have a https://-header (even though the 'https'-part will be hidden from the user. Furthermore, some tracking parameters will be removed",
            "url"
        )
        this._forceHttps = forceHttps ?? false
    }

    /**
     * Prepends `https://` if the given string does not start with a scheme yet.
     */
    private static ensureScheme(str: string): string {
        return UrlValidator.schemePattern.test(str) ? str : "https://" + str
    }

    /**
     * Gives the lowercased host of the URL with a leading `www.` removed,
     * which is the format used in `spamWebsites` and `discouragedWebsites`.
     */
    private static hostWithoutWww(url: URL): string {
        const host = url.host.toLowerCase()
        return host.startsWith("www.") ? host.slice(4) : host
    }

    /**
     * Normalises a URL: adds `https://` if no scheme was typed, optionally forces https
     * and removes known tracking parameters (matched case-insensitively).
     * The case of the URL itself is left untouched: URLs are case sensitive, lowercasing them might break them. See #763
     *
     * Returns `undefined` (and logs the error) if the string cannot be parsed as URL.
     *
     * new UrlValidator().reformat("https://example.com/page?fbclid=123456&utm_source=mastodon") // => "https://example.com/page"
     * new UrlValidator().reformat("https://example.com/page?AdGroupId=42&ref=abc") // => "https://example.com/page?ref=abc"
     * new UrlValidator().reformat("Https://example.com") // => "https://example.com"
     * new UrlValidator().reformat("https://example.com/dir/?fbclid=1") // => "https://example.com/dir/"
     */
    reformat(str: string): string {
        try {
            const url = new URL(UrlValidator.ensureScheme(str))
            if (this._forceHttps) {
                url.protocol = "https:"
            }

            // Rebuild the query without the tracking parameters; order and duplicates of the others are kept
            const kept = new URLSearchParams()
            url.searchParams.forEach((value, key) => {
                if (!UrlValidator.trackingParams.has(key.toLowerCase())) {
                    kept.append(key, value)
                }
            })
            url.search = kept.toString()

            let cleaned = url.toString()

            // The part of the input before the query string or fragment
            const queryStart = str.search(/[?#]/)
            const typedPath = queryStart < 0 ? str : str.slice(0, queryStart)
            if (cleaned.endsWith("/") && !str.endsWith("/") && !typedPath.endsWith("/")) {
                // Do not add a trailing '/' if it wasn't typed originally.
                // A '/' typed right before a (now removed) query string is part of the path and must stay
                cleaned = cleaned.slice(0, -1)
            }

            return cleaned
        } catch (e) {
            console.error(e)
            return undefined
        }
    }

    /**
     * Gives a human-readable explanation of what is wrong with the given URL, if anything.
     * Spam sites and discouraged sites get a dedicated message; everything else is delegated to the parent class.
     *
     * const v = new UrlValidator()
     * v.getFeedback("example.").textFor("en") // => "This is not a valid web address"
     * v.getFeedback("https://booking.com/some-hotel.html").textFor("en") // => Translations.t.validation.url.spamSite.Subs({host: "booking.com"}).textFor("en")
     */
    getFeedback(s: string, getCountry?: () => string): Translation | undefined {
        s = UrlValidator.ensureScheme(s)
        try {
            const host = UrlValidator.hostWithoutWww(new URL(s))
            if (UrlValidator.spamWebsites.has(host)) {
                return Translations.t.validation.url.spamSite.Subs({ host })
            }
            if (UrlValidator.discouragedWebsites.has(host)) {
                return Translations.t.validation.url.aggregator.Subs({ host })
            }
        } catch (e) {
            // Not parsable as URL: deliberately ignored, the generic feedback of the parent class is used below
        }
        return super.getFeedback(s, getCountry) || undefined
    }

    /**
     * A URL is valid if it can be parsed, its host is not a known spam site,
     * and its host contains a dot which is neither the first nor the last character.
     *
     * const v = new UrlValidator()
     * v.isValid("https://booking.com/some-hotel.html") // => false
     * v.isValid("Https://example.com") // => true
     * v.isValid("example.") // => false
     */
    isValid(str: string): boolean {
        try {
            const url = new URL(UrlValidator.ensureScheme(str))
            if (UrlValidator.spamWebsites.has(UrlValidator.hostWithoutWww(url))) {
                return false
            }

            const dotIndex = url.host.indexOf(".")
            return dotIndex > 0 && !url.host.endsWith(".")
        } catch (e) {
            return false
        }
    }
}
refactoring: Fix generate:layeroverview 2023-03-29 17:56:42 +02:00			`import { Validator } from "../Validator"`
UX: warn for aggregator websites, try to nudge https://wiki.openstreetmap.org/wiki/Organised_Editing/Activities/Trziste_prace to not use aggregator websites 2024-08-21 12:05:20 +02:00			`import { Translation } from "../../i18n/Translation"`
			`import Translations from "../../i18n/Translations"`
More refactoring 2023-03-29 17:21:20 +02:00
			`export default class UrlValidator extends Validator {`
Add linked data module which scrapes websites 2024-02-22 18:58:34 +01:00			`private readonly _forceHttps: boolean`
UX: warn for aggregator websites, try to nudge https://wiki.openstreetmap.org/wiki/Organised_Editing/Activities/Trziste_prace to not use aggregator websites 2024-08-21 12:05:20 +02:00
Improve URL-validation for blocked and discouraged sites 2024-08-24 01:50:34 +02:00			`private static readonly spamWebsites = new Set<string>([`
UX: warn for aggregator websites, try to nudge https://wiki.openstreetmap.org/wiki/Organised_Editing/Activities/Trziste_prace to not use aggregator websites 2024-08-21 12:05:20 +02:00			`"booking.com",`
chore: automated housekeeping... 2024-08-23 13:13:41 +02:00			`"hotel-details-guide.com",`
			`"tripingguide.com",`
			`"tripadvisor.com",`
			`"tripadvisor.co.uk",`
			`"tripadvisor.com.au",`
Improve URL-validation for blocked and discouraged sites 2024-08-24 01:50:34 +02:00			`"katestravelexperience.eu",`
chore: automated housekeeping... 2024-09-02 12:48:15 +02:00			`"hoteldetails.eu",`
Improve URL-validation for blocked and discouraged sites 2024-08-24 01:50:34 +02:00			`])`

chore: automated housekeeping... 2024-09-02 12:48:15 +02:00			`private static readonly discouragedWebsites = new Set<string>(["facebook.com"])`
UX: warn for aggregator websites, try to nudge https://wiki.openstreetmap.org/wiki/Organised_Editing/Activities/Trziste_prace to not use aggregator websites 2024-08-21 12:05:20 +02:00
Add linked data module which scrapes websites 2024-02-22 18:58:34 +01:00			`constructor(name?: string, explanation?: string, forceHttps?: boolean) {`
More refactoring 2023-03-29 17:21:20 +02:00			`super(`
Chore: formatting 2023-11-09 16:30:26 +01:00			`name ?? "url",`
			`explanation ??`
chore: automated housekeeping... 2024-08-23 13:13:41 +02:00			`"The validatedTextField will format URLs to always be valid and have a https://-header (even though the 'https'-part will be hidden from the user. Furthermore, some tracking parameters will be removed",`
			`"url"`
More refactoring 2023-03-29 17:21:20 +02:00			`)`
Add linked data module which scrapes websites 2024-02-22 18:58:34 +01:00			`this._forceHttps = forceHttps ?? false`
More refactoring 2023-03-29 17:21:20 +02:00			`}`
UX: warn for aggregator websites, try to nudge https://wiki.openstreetmap.org/wiki/Organised_Editing/Activities/Trziste_prace to not use aggregator websites 2024-08-21 12:05:20 +02:00
			`/**`
			`*`
			`* new UrlValidator().reformat("https://example.com/page?fbclid=123456&utm_source=mastodon") // => "https://example.com/page"`
			`*/`
More refactoring 2023-03-29 17:21:20 +02:00			`reformat(str: string): string {`
			`try {`
			`let url: URL`
			`// str = str.toLowerCase() // URLS are case sensitive. Lowercasing them might break some URLS. See #763`
			`if (`
			`!str.startsWith("http://") &&`
			`!str.startsWith("https://") &&`
			`!str.startsWith("http:")`
			`) {`
			`url = new URL("https://" + str)`
			`} else {`
			`url = new URL(str)`
			`}`
Add linked data module which scrapes websites 2024-02-22 18:58:34 +01:00			`if (this._forceHttps) {`
			`url.protocol = "https:"`
			`}`
More refactoring 2023-03-29 17:21:20 +02:00			`const blacklistedTrackingParams = [`
			`"fbclid", // Oh god, how I hate the fbclid. Let it burn, burn in hell!`
			`"gclid",`
			`"cmpid",`
			`"agid",`
			`"utm",`
			`"utm_source",`
			`"utm_medium",`
			`"campaignid",`
			`"campaign",`
			`"AdGroupId",`
			`"AdGroup",`
			`"TargetId",`
			`"msclkid",`
Add module to fetch data (via a proxy) from the website with jsonld 2024-02-26 02:24:46 +01:00			`"pk_source",`
			`"pk_medium",`
			`"pk_campaign",`
			`"pk_content",`
Chore: linting 2024-04-13 02:40:21 +02:00			`"pk_kwd",`
More refactoring 2023-03-29 17:21:20 +02:00			`]`
			`for (const dontLike of blacklistedTrackingParams) {`
			`url.searchParams.delete(dontLike.toLowerCase())`
			`}`
			`let cleaned = url.toString()`
			`if (cleaned.endsWith("/") && !str.endsWith("/")) {`
			`// Do not add a trailing '/' if it wasn't typed originally`
			`cleaned = cleaned.substr(0, cleaned.length - 1)`
			`}`

			`return cleaned`
			`} catch (e) {`
			`console.error(e)`
			`return undefined`
			`}`
			`}`

UX: warn for aggregator websites, try to nudge https://wiki.openstreetmap.org/wiki/Organised_Editing/Activities/Trziste_prace to not use aggregator websites 2024-08-21 12:05:20 +02:00			`/**`
			`*`
			`* const v = new UrlValidator()`
			`* v.getFeedback("example.").textFor("en") // => "This is not a valid web address"`
Fix tests 2024-08-25 02:50:28 +02:00			`* v.getFeedback("https://booking.com/some-hotel.html").textFor("en") // => Translations.t.validation.url.spamSite.Subs({host: "booking.com"}).textFor("en")`
UX: warn for aggregator websites, try to nudge https://wiki.openstreetmap.org/wiki/Organised_Editing/Activities/Trziste_prace to not use aggregator websites 2024-08-21 12:05:20 +02:00			`*/`
			`getFeedback(s: string, getCountry?: () => string): Translation \| undefined {`
chore: automated housekeeping... 2024-09-02 12:48:15 +02:00			`if (!s.startsWith("http://") && !s.startsWith("https://") && !s.startsWith("http:")) {`
Improve URL-validation for blocked and discouraged sites 2024-08-24 01:50:34 +02:00			`s = "https://" + s`
			`}`
chore: automated housekeeping... 2024-09-02 12:48:15 +02:00			`try {`
Block aggregator websites 2024-08-23 21:21:27 +02:00			`const url = new URL(s)`
			`let host = url.host.toLowerCase()`
			`if (host.startsWith("www.")) {`
			`host = host.slice(4)`
			`}`
Improve URL-validation for blocked and discouraged sites 2024-08-24 01:50:34 +02:00			`if (UrlValidator.spamWebsites.has(host)) {`
			`return Translations.t.validation.url.spamSite.Subs({ host })`
			`}`
			`if (UrlValidator.discouragedWebsites.has(host)) {`
Block aggregator websites 2024-08-23 21:21:27 +02:00			`return Translations.t.validation.url.aggregator.Subs({ host })`
			`}`
chore: automated housekeeping... 2024-09-02 12:48:15 +02:00			`} catch (e) {`
Block aggregator websites 2024-08-23 21:21:27 +02:00			`// pass`
			`}`
UX: warn for aggregator websites, try to nudge https://wiki.openstreetmap.org/wiki/Organised_Editing/Activities/Trziste_prace to not use aggregator websites 2024-08-21 12:05:20 +02:00			`const upstream = super.getFeedback(s, getCountry)`
			`if (upstream) {`
			`return upstream`
			`}`
Block aggregator websites 2024-08-23 21:21:27 +02:00
UX: warn for aggregator websites, try to nudge https://wiki.openstreetmap.org/wiki/Organised_Editing/Activities/Trziste_prace to not use aggregator websites 2024-08-21 12:05:20 +02:00			`return undefined`
			`}`

Fix tests 2024-08-25 02:50:28 +02:00			`/**`
			`* const v = new UrlValidator()`
			`* v.isValid("https://booking.com/some-hotel.html") // => false`
			`*/`
More refactoring 2023-03-29 17:21:20 +02:00			`isValid(str: string): boolean {`
			`try {`
			`if (`
			`!str.startsWith("http://") &&`
			`!str.startsWith("https://") &&`
			`!str.startsWith("http:")`
			`) {`
			`str = "https://" + str`
			`}`
			`const url = new URL(str)`
Block aggregator websites 2024-08-23 21:21:27 +02:00
			`let host = url.host.toLowerCase()`
			`if (host.startsWith("www.")) {`
			`host = host.slice(4)`
			`}`
Improve URL-validation for blocked and discouraged sites 2024-08-24 01:50:34 +02:00			`if (UrlValidator.spamWebsites.has(host)) {`
Block aggregator websites 2024-08-23 21:21:27 +02:00			`return false`
			`}`

Improve URL-validation for blocked and discouraged sites 2024-08-24 01:50:34 +02:00			`const dotIndex = url.host.indexOf(".")`
More refactoring 2023-03-29 17:21:20 +02:00			`return dotIndex > 0 && url.host[url.host.length - 1] !== "."`
			`} catch (e) {`
			`return false`
			`}`
			`}`
			`}`