Here you can find the source of NormalizeURL(String URL)
public static String NormalizeURL(String URL)
//package com.java2s;
/*//from w ww . java 2 s . c o m
 * HTML-cleaner used in TREC 19,20,21 adhoc.
 *
 * Boytsov, L., Belova, A., 2011. Evaluating Learning-to-Rank Methods in the Web Track Adhoc Task.
 * In TREC-20: Proceedings of the Nineteenth Text REtrieval Conference.
 *
 * Author: Leonid Boytsov
 * Copyright (c) 2013
 *
 * This code is released under the
 * Apache License Version 2.0 http://www.apache.org/licenses/.
 */
import java.net.URI;
import java.net.URISyntaxException;

public class Main {
    /**
     * Reduces an http/https/ftp URL to {@code scheme://host[:port]/path}.
     *
     * <p>User info, query string and fragment are dropped, and an empty path
     * becomes {@code "/"}. Surrounding whitespace is stripped before parsing,
     * so a padded URL is normalized exactly like its unpadded form.
     *
     * <p>The input is returned trimmed but otherwise unchanged when it cannot
     * be parsed as a URI, has no host or scheme, or uses a scheme other than
     * {@code http}, {@code https} or {@code ftp}. The scheme comparison is
     * case-sensitive, so {@code "HTTP://..."} is returned as-is.
     *
     * @param URL the URL to normalize; must not be {@code null}
     * @return the normalized URL, or the trimmed input if it cannot be normalized
     * @throws NullPointerException if {@code URL} is {@code null}
     */
    public static String NormalizeURL(String URL) {
        // Trim before parsing: java.net.URI rejects whitespace, so parsing the
        // raw string would send every padded URL down the fallback path.
        final String trimmed = URL.trim();

        final URI parsed;
        try {
            parsed = new URI(trimmed);
        } catch (URISyntaxException e) {
            // Not a well-formed URI: best effort is the trimmed input.
            return trimmed;
        }

        final String host = parsed.getHost();
        final String scheme = parsed.getScheme();
        if (host == null || scheme == null || !isSupportedScheme(scheme)) {
            return trimmed;
        }

        String path = parsed.getPath();
        if (path == null || path.isEmpty()) {
            path = "/";
        }

        try {
            // Rebuild without user info, query and fragment; the port is kept
            // (-1, i.e. undefined, is omitted from the result by URI).
            return new URI(scheme, null, host, parsed.getPort(), path, null, null).toString();
        } catch (URISyntaxException e) {
            return trimmed;
        }
    }

    /** Returns true for the schemes this normalizer rewrites. */
    private static boolean isSupportedScheme(String scheme) {
        return scheme.equals("http") || scheme.equals("https") || scheme.equals("ftp");
    }
}