Here you can find the source of normalize(String url_str)
public static String normalize(String url_str) throws MalformedURLException, UnsupportedEncodingException
//package com.java2s;
//License from project: Open Source License

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Locale;

/**
 * Static helpers that bring http/https/ftp URL strings into a single textual form so that
 * equivalent spellings of an address compare equal.
 */
public class Main {

    /** Schemes (including the "://" separator) for which a leading "www." host label is dropped. */
    private static final String[] WWW_SCHEMES = { "http://", "https://", "ftp://" };

    /**
     * Normalizes a URL string by running, in order, {@link #clean(String)},
     * {@link #removewww(String)} and {@link #translateWhiteSpaces(String)}.
     *
     * @param url_str the raw URL text
     * @return the normalized URL text
     * @throws MalformedURLException if {@link #clean(String)} rejects the input
     * @throws UnsupportedEncodingException declared by {@link URLDecoder#decode(String, String)}
     */
    public static String normalize(String url_str) throws MalformedURLException, UnsupportedEncodingException {
        String cleaned = clean(url_str);
        String withoutWww = removewww(cleaned);
        return translateWhiteSpaces(withoutWww);
    }

    /**
     * Lower-cases and trims the input, verifies that it starts with "http" or "ftp", drops the
     * fragment, percent-decodes it as UTF-8, round-trips it through {@link URL} and strips all
     * trailing slashes.
     *
     * <p>Note that the whole string is lower-cased (path and query included) and that
     * {@link URLDecoder} also turns {@code '+'} into a space.
     *
     * @param url_str the raw URL text
     * @return the cleaned URL text, without fragment and without trailing slashes
     * @throws MalformedURLException if the text does not start with "http" or "ftp", is shorter
     *         than 5 characters, cannot be parsed by {@link URL}, or is shorter than 10
     *         characters after cleaning
     * @throws UnsupportedEncodingException declared by {@link URLDecoder#decode(String, String)}
     */
    public static String clean(String url_str) throws MalformedURLException, UnsupportedEncodingException {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (with a Turkish default locale, 'I' would otherwise become a dotless 'ı').
        url_str = url_str.toLowerCase(Locale.ROOT).trim();

        if (!url_str.startsWith("http") && !url_str.startsWith("ftp")) {
            throw new MalformedURLException("URl does not start with http or ftp!: " + url_str);
        }
        if (url_str.length() < 5) {
            throw new MalformedURLException("Short URL: " + url_str);
        }

        // The fragment starts at the FIRST '#'; cutting at the last one would leave
        // part of the fragment behind for inputs such as "http://host/a#b#c".
        int fragmentStart = url_str.indexOf('#');
        if (fragmentStart >= 0) {
            url_str = url_str.substring(0, fragmentStart);
        }

        try {
            url_str = URLDecoder.decode(url_str, "UTF-8");
        } catch (IllegalArgumentException exp) {
            // Best effort: a malformed percent-escape leaves the text undecoded.
            // Any other IllegalArgumentException (including one without a message) is rethrown.
            String message = exp.getMessage();
            boolean malformedEscape = message != null
                    && (message.contains("Illegal hex characters in escape (%) pattern")
                            || message.contains("Incomplete trailing escape (%) pattern"));
            if (!malformedEscape) {
                throw exp;
            }
        }

        // Parsing validates the protocol; toString() yields the textual form used from here on.
        URL url = new URL(url_str);
        url_str = url.toString();

        int end = url_str.length();
        while (end > 0 && url_str.charAt(end - 1) == '/') {
            end--;
        }
        url_str = url_str.substring(0, end);

        if (url_str.length() < 10) {
            throw new MalformedURLException("Short URL: " + url_str);
        }
        return url_str;
    }

    /**
     * Removes a leading "www." host label from URLs that start with "http://", "https://" or
     * "ftp://". The comparison is case-sensitive; any other input is returned unchanged.
     *
     * @param url the URL text
     * @return the URL text without the leading "www." label
     */
    public static String removewww(String url) {
        final String label = "www.";
        for (String scheme : WWW_SCHEMES) {
            if (url.startsWith(scheme + label)) {
                return scheme + url.substring(scheme.length() + label.length());
            }
        }
        return url;
    }

    /**
     * Percent-encodes space, tab and line-feed characters as "%20", "%09" and "%0A".
     * Other characters are left untouched.
     *
     * @param url the URL text
     * @return the URL text with those three characters percent-encoded
     */
    public static String translateWhiteSpaces(String url) {
        // Literal replacement; no regular expression is needed for fixed single characters.
        return url.replace(" ", "%20").replace("\t", "%09").replace("\n", "%0A");
    }
}