From 8b44efb463791c075849e5d5d1558ae475197eca Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Mon, 24 Jan 2022 13:51:56 +0100 Subject: [PATCH] Minor: Added disregarded protocols from https://github.com/yasserg/crawler4j/pull/446 --- .../java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java b/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java index 584ab269d..0fca61cce 100644 --- a/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java +++ b/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java @@ -126,8 +126,12 @@ private Set getOutgoingUrls(String contextURL, HtmlContentHandler conten } String hrefLoweredCase = href.trim().toLowerCase(Locale.ROOT); - if (!hrefLoweredCase.contains("javascript:") && - !hrefLoweredCase.contains("mailto:") && !hrefLoweredCase.contains("@")) { + if (!hrefLoweredCase.contains("about:") && !hrefLoweredCase.contains("tel:") && + !hrefLoweredCase.contains("data:") && !hrefLoweredCase.contains("whatsapp:") && + !hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("viber:") && + !hrefLoweredCase.contains("sms:") && !hrefLoweredCase.contains("android-app:") && + !hrefLoweredCase.contains("fb-messenger:") && !hrefLoweredCase.contains("mailto:") && + !hrefLoweredCase.contains("@") && !hrefLoweredCase.contains("fb-messenger:")) { String url = normalizer.filter(UrlResolver.resolveUrl((contextURL == null) ? "" : contextURL, href)); if (url != null) { WebURL webURL = factory.newWebUrl();