List of usage examples for java.text.Normalizer.isNormalized
public static boolean isNormalized(CharSequence src, Form form)
From source file: org.w3.i18n.ParsedDocument.java
/**
 * Builds a parsed view of a document: decodes the body, runs the HTML parser over it and
 * collects charset declarations, language declarations, 'dir' attributes, byte order marks,
 * non-ASCII/non-NFC class and id names and a few other tag lists into this object's fields.
 *
 * @param documentResource the document to inspect (body plus HTTP response headers);
 *        must not be null.
 * @throws NullPointerException if {@code documentResource} is null.
 * @throws RuntimeException wrapping the {@link IOException} raised while reading the body.
 */
public ParsedDocument(DocumentResource documentResource) {
    if (documentResource == null) {
        throw new NullPointerException("documentResource: " + documentResource);
    }

    // Charset names and tag names are protocol tokens, not natural-language text, so every
    // case conversion below is locale-independent. With the default locale a Turkish JVM
    // would turn "iso-8859-1" into "İSO-8859-1" and would never match the tag name "I".
    final java.util.Locale root = java.util.Locale.ROOT;
    final Charset usAscii = Charset.forName("US-ASCII");

    // Prepare resources:
    this.documentResource = documentResource;
    // TODO: Currently a blocking operation.
    // NOTE(review): if getBody() returns a stream it is not closed here -- confirm who owns it.
    byte[] documentBodyBytes;
    try {
        documentBodyBytes = IOUtils.toByteArray(documentResource.getBody());
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // Only the first four bytes are inspected for a BOM; bodies of five bytes or fewer are
    // treated as having none.
    this.byteOrderMark = documentBodyBytes.length <= 5
            ? null
            : Utils.findByteOrderMark(Arrays.copyOf(documentBodyBytes, 4));
    // Without a BOM the body is decoded as UTF-8.
    this.documentBody = new String(documentBodyBytes, byteOrderMark == null
            ? Charset.forName("UTF-8")
            : Charset.forName(byteOrderMark.getCharsetName()));

    // Use the HTML parser on the document body.
    this.source = new Source(documentBody);
    this.source.fullSequentialParse();

    // Process resources to find information.

    // Pattern for finding charset in "http-equiv=Content-Type" meta tags.
    Pattern contentTypeCharsetPattern = Pattern.compile("charset\\s*=\\s*([^\\s;]+)");

    // Document Type Declaration (DOCTYPE).
    // NB: In a valid document there is only one DTD.
    StartTag doctypeStartTag = source.getFirstStartTag(StartTagType.DOCTYPE_DECLARATION);
    this.doctypeTag = doctypeStartTag == null ? null : doctypeStartTag.toString().trim();
    this.doctypeClassification = classifyDoctype(doctypeTag);

    // XML Declaration ('?xml' tag at the start of the document).
    StartTag xmlStartTag = source.getFirstStartTag(StartTagType.XML_DECLARATION);
    if (xmlStartTag != null) {
        this.xmlDeclaration = xmlStartTag.toString().trim();
        Matcher charsetMatcher = Pattern.compile("encoding\\s*=\\s*'?\"?\\s*([^\"'\\s\\?>]+)")
                .matcher(this.xmlDeclaration);
        this.charsetXmlDeclaration = charsetMatcher.find() ? charsetMatcher.group(1) : null;
    } else {
        this.xmlDeclaration = null;
        this.charsetXmlDeclaration = null;
    }

    // Find all charset declarations in meta tags.
    this.charsetMetaTags = new TreeMap<>();
    this.charsetMetaTagsOutside1024 = new ArrayList<>();
    List<Element> metaElements = source.getAllElements("meta");
    for (Element metaElement : metaElements) {
        String charset = null;
        String charsetAttr = metaElement.getAttributeValue("charset");
        if (charsetAttr != null) {
            // A "<meta charset="..." >" tag.
            charset = charsetAttr.trim();
        } else {
            // Otherwise look for a "<meta http-equiv="Content-Type" ... >" tag.
            String httpEquiv = metaElement.getAttributeValue("http-equiv");
            String content = metaElement.getAttributeValue("content");
            if (httpEquiv != null && content != null
                    && httpEquiv.equalsIgnoreCase("Content-Type")) {
                Matcher m = contentTypeCharsetPattern.matcher(content);
                if (m.find()) {
                    charset = m.group(1);
                }
            }
        }
        // If a charset declaration was found, add this tag to the list.
        if (charset != null) {
            charset = charset.trim();
            if (!charset.isEmpty()) {
                String tag = metaElement.getStartTag().toString().trim();
                if (!charsetMetaTags.containsKey(charset)) {
                    charsetMetaTags.put(charset, new ArrayList<String>());
                }
                charsetMetaTags.get(charset).add(tag);
                // Declarations ending beyond position 1024 of the parsed text are also
                // recorded separately (untrimmed start tag).
                if (metaElement.getEnd() > 1024) {
                    charsetMetaTagsOutside1024.add(metaElement.getStartTag().toString());
                }
            }
        }
    }

    // Find the 'Content-Type' HTTP response header and process it.
    this.contentType = documentResource.getHeader("Content-Type");
    /* TODO: DEBUG! The check against "text/html;; charset=UTF-8" is a workaround for passing
     * tests that don't detect a bug in the old checker. See:
     * http://qa-dev.w3.org/i18n-checker-test/check.php?uri=http%3A%2F%2Fwww.w3.org%2FInternational%2Ftests%2Fi18n-checker%2Fgenerate%3Ftest%3D24%26format%3Dhtml%26serveas%3Dhtml
     * ~~~ Joe
     */
    if (contentType != null && !contentType.equals("text/html;; charset=UTF-8")) {
        Matcher m = contentTypeCharsetPattern.matcher(contentType);
        this.charsetHttp = m.find() ? m.group(1) : null;
        this.servedAsXml = contentType.contains("application/xhtml+xml");
    } else {
        this.charsetHttp = null;
        this.servedAsXml = false;
    }

    // Find the opening 'html' tag and look for some choice attributes.
    Element htmlElement = source.getFirstElement("html");
    if (htmlElement != null) {
        this.openingHtmlTag = htmlElement.getStartTag().toString();
        this.openingHtmlTagLang = htmlElement.getAttributeValue("lang");
        this.openingHtmlTagXmlLang = htmlElement.getAttributeValue("xml:lang");
        this.defaultDir = htmlElement.getAttributeValue("dir");
    } else {
        this.openingHtmlTag = null;
        this.openingHtmlTagLang = null;
        this.openingHtmlTagXmlLang = null;
        this.defaultDir = null;
    }

    // Find the 'Content-Language' HTTP response header.
    this.contentLanguage = documentResource.getHeader("Content-Language");

    // Find a 'meta' tag with 'http-equiv="Content-Language"'. The first such tag that has
    // a 'content' attribute wins; tags without content are skipped.
    /* TODO: Change this to a similar structure that the charset meta tags
     * are stored in. */
    String langMetaS = null;
    for (Element metaElement : metaElements) {
        String httpEquiv = metaElement.getAttributeValue("http-equiv");
        if (httpEquiv != null && httpEquiv.equalsIgnoreCase("Content-Language")) {
            langMetaS = metaElement.getAttributeValue("content");
            if (langMetaS != null) {
                break;
            }
        }
    }
    this.langMeta = langMetaS;

    // Find class and id names that are non-ASCII or non-NFC.
    this.allNonNfcClassIdNames = new TreeSet<>();
    this.allNonNfcClassIdTags = new ArrayList<>();
    Set<Element> nonNfcClassIdNamesElements = new LinkedHashSet<>();
    CharsetEncoder usAsciiEncoder = usAscii.newEncoder();
    for (Element element : source.getAllElements()) {
        Set<String> names = new TreeSet<>();
        String classAttr = element.getAttributeValue("class");
        String idAttr = element.getAttributeValue("id");
        if (classAttr != null) {
            // Class lists are separated by any whitespace (tabs and newlines included),
            // not only by U+0020.
            for (String className : classAttr.split("\\s+")) {
                if (!className.isEmpty()) {
                    names.add(className);
                }
            }
        }
        if (idAttr != null) {
            String id = idAttr.trim();
            if (!id.isEmpty()) {
                names.add(id);
            }
        }
        boolean nonNfcAscii = false;
        for (String name : names) {
            boolean nonAscii = !usAsciiEncoder.canEncode(name);
            // Unicode normalisation check; only evaluated for names that are pure ASCII.
            if (nonAscii || !Normalizer.isNormalized(name, Normalizer.Form.NFC)) {
                nonNfcAscii = true;
                allNonNfcClassIdNames.add(name);
            }
        }
        if (nonNfcAscii) {
            nonNfcClassIdNamesElements.add(element);
        }
    }
    for (Element element : nonNfcClassIdNamesElements) {
        this.allNonNfcClassIdTags.add(element.getStartTag().toString());
    }

    // Find any BOMs in the content.
    // NOTE(review): the scan stops five bytes before the end of the body, so a BOM in the
    // last few bytes is not reported -- confirm this is intended.
    this.bomsInContent = new ArrayList<>();
    for (int i = 1; i < documentBodyBytes.length - 5; i++) {
        ByteOrderMark bom = Utils.findByteOrderMark(
                Arrays.copyOfRange(documentBodyBytes, i, i + 5));
        if (bom != null) {
            // Add a context of 15 characters either side to the list.
            // NOTE(review): endOfContext is used as an exclusive bound, so when it is clamped
            // to length - 1 the final byte of the body is left out -- confirm.
            int startofContext = Math.max(0, i - 15);
            int endOfContext = Math.min(documentBodyBytes.length - 1, i + 20);
            /* The context will look something like:
             * " ... comes the BOM /???/. Ok, test that. ... "
             *
             * A BOM decoded as US-ASCII shows up as a run of replacement characters
             * (depending on the number of code points it uses).
             */
            String context = new String(
                    Arrays.copyOfRange(documentBodyBytes, startofContext, endOfContext), usAscii)
                    .replaceAll("\\s+", " ");
            bomsInContent.add((startofContext == 0 ? "\"" : "\" ... ")
                    + context
                    + (endOfContext == documentBodyBytes.length - 1 ? "\"" : " ... \""));
            // Skip ahead so the same BOM is not reported again.
            i += 2;
        }
    }

    // Use the BOM to determine whether the document is in UTF-16.
    // NB: This is behaviour copied across from the old project.
    if (byteOrderMark == null) {
        this.utf16 = false;
    } else {
        this.utf16 = byteOrderMark.getCharsetName().toUpperCase(root).matches(".*UTF-16.*");
    }

    // Find all 'a' and 'link' tags with a 'charset' attribute.
    this.charsetLinkTags = new ArrayList<>();
    for (Element element : source.getAllElements()) {
        String name = element.getName().toLowerCase(root);
        if ((name.equals("a") || name.equals("link"))
                && element.getAttributeValue("charset") != null) {
            this.charsetLinkTags.add(element.getStartTag().toString().trim());
        }
    }

    // Find 'bdo' tags without 'dir' attributes.
    this.bdoTagsWithoutDir = new ArrayList<>();
    for (Element element : source.getAllElements("bdo")) {
        if (element.getAttributeValue("dir") == null) {
            bdoTagsWithoutDir.add(element.getStartTag().toString().trim());
        }
    }

    // Find all 'b' and 'i' tags without a class name.
    this.bITagsWithoutClass = new ArrayList<>();
    for (Element element : source.getAllElements()) {
        String name = element.getName().toLowerCase(root);
        if (name.equals("b") || name.equals("i")) {
            String classAttr = element.getAttributeValue("class");
            if (classAttr == null || classAttr.trim().isEmpty()) {
                // Report the element text, shortened to its first 14 characters when it is
                // longer than 15.
                String context = element.toString();
                if (context.length() > 15) {
                    context = context.substring(0, 14) + " ... ";
                }
                bITagsWithoutClass.add("\"" + context + "\"");
            }
        }
    }

    // Make aggregates of charset declarations (upper-cased so they compare equal).
    this.allCharsetDeclarations = new TreeSet<>();
    this.inDocCharsetDeclarations = new TreeSet<>();
    if (this.charsetHttp != null) {
        // The HTTP header is not an in-document declaration.
        String d = this.charsetHttp.trim().toUpperCase(root);
        this.allCharsetDeclarations.add(d);
    }
    if (this.byteOrderMark != null) {
        String d = this.byteOrderMark.getCharsetName().toUpperCase(root);
        this.allCharsetDeclarations.add(d);
        this.inDocCharsetDeclarations.add(d);
    }
    if (this.charsetXmlDeclaration != null) {
        String d = this.charsetXmlDeclaration.trim().toUpperCase(root);
        this.allCharsetDeclarations.add(d);
        this.inDocCharsetDeclarations.add(d);
    }
    for (String charset : charsetMetaTags.keySet()) {
        String d = charset.toUpperCase(root);
        this.allCharsetDeclarations.add(d);
        this.inDocCharsetDeclarations.add(d);
    }
    this.nonUtf8CharsetDeclarations = new TreeSet<>();
    for (String charsetDeclaration : this.allCharsetDeclarations) {
        if (!charsetDeclaration.equalsIgnoreCase("UTF-8")) {
            nonUtf8CharsetDeclarations.add(charsetDeclaration);
        }
    }

    // Make aggregates of language declarations.
    this.allConflictingLangAttributes = new ArrayList<>();
    this.allLangAttributes = new TreeSet<>();
    this.allXmlLangAttributes = new TreeSet<>();
    this.allLangAttributeTags = new ArrayList<>();
    this.allXmlLangAttributeTags = new ArrayList<>();
    for (Element element : source.getAllElements()) {
        String langAttr = element.getAttributeValue("lang");
        String xmlLangAttr = element.getAttributeValue("xml:lang");
        String lang = null;
        String xmlLang = null;
        String tag = element.getStartTag().toString().trim();
        if (langAttr != null) {
            lang = langAttr.trim();
            if (!lang.isEmpty()) {
                allLangAttributes.add(lang);
                allLangAttributeTags.add(tag);
            }
        }
        if (xmlLangAttr != null) {
            xmlLang = xmlLangAttr.trim();
            if (!xmlLang.isEmpty()) {
                allXmlLangAttributes.add(xmlLang);
                allXmlLangAttributeTags.add(tag);
            }
        }
        // Both attributes present but with different (trimmed) values: record the conflict.
        if (lang != null && xmlLang != null && !lang.equals(xmlLang)) {
            this.allConflictingLangAttributes.add(Arrays.asList(lang, xmlLang, tag));
        }
    }

    // Find all values of dir attributes.
    this.allDirAttributes = new TreeSet<>();
    for (Element element : source.getAllElements()) {
        String dir = element.getAttributeValue("dir");
        if (dir != null) {
            allDirAttributes.add(dir);
        }
    }
}
From source file: org.jets3t.service.utils.FileComparer.java
/**
 * Normalize string into "Normalization Form Canonical Decomposition" (NFD).
 *
 * References:
 * http://stackoverflow.com/questions/3610013
 * http://en.wikipedia.org/wiki/Unicode_equivalence
 *
 * @param str
 * the string to normalize; must not be null.
 * @return the string in NFD form; the same instance is returned when it is already
 * NFD-normalized.
 */
protected String normalizeUnicode(String str) {
    final Normalizer.Form form = Normalizer.Form.NFD;
    // Check first so an already-normalized string is handed back as the same instance.
    if (Normalizer.isNormalized(str, form)) {
        return str;
    }
    return Normalizer.normalize(str, form);
}
From source file: org.opensextant.util.TextUtils.java
/**
 * Converts text to "Normalization Form Canonical Decomposition" (NFD).
 *
 * REF: http://stackoverflow.com/questions/3610013/file-listfiles-mangles-unicode-names-with-jdk-6-unicode-normalization-issues
 *
 * This supports proper file name retrieval from the file system, among other things.
 * In many situations we see Unicode file names -- Java can list them, but when the
 * Java-provided version of the file name is used, the OS/FS may not be able to find
 * the file by the name given in a particular normalized form.
 *
 * @param str
 *            text to normalize
 * @return the text in NFD form; the argument itself when it is already NFD
 */
public static String normalizeUnicode(String str) {
    return Normalizer.isNormalized(str, Normalizer.Form.NFD)
            ? str
            : Normalizer.normalize(str, Normalizer.Form.NFD);
}
From source file: info.ajaxplorer.synchro.SyncJob.java
/**
 * Returns the given string in Unicode NFD (canonical decomposition) form.
 *
 * @param str the string to normalize
 * @return {@code str} itself when it is already NFD, otherwise its NFD-normalized form
 */
protected String normalizeUnicode(String str) {
    boolean alreadyDecomposed = Normalizer.isNormalized(str, Normalizer.Form.NFD);
    if (alreadyDecomposed) {
        return str;
    }
    String decomposed = Normalizer.normalize(str, Normalizer.Form.NFD);
    return decomposed;
}
From source file: de.innovationgate.utils.WGUtils.java
/** * performs a unicode normalization to NFC form (java.text.Normalizer.Form.NFC) for the given input * @param input The input string/*from w ww . j a va 2 s . com*/ * @return the normalized or original value if already NFC form */ public static String normalizeUnicode(String input) { if (input != null && !Normalizer.isNormalized(input, Normalizer.Form.NFC)) { return Normalizer.normalize(input, Normalizer.Form.NFC); } return input; }