List of usage examples for java.text.Normalizer.normalize
public static String normalize(CharSequence src, Form form)
From source file:com.linkedpipes.plugin.loader.dcatAp11ToCkanBatch.DcatAp11ToCkanBatch.java
/**
 * Pushes every DCAT dataset found in the input data to CKAN through its HTTP API.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Read the API URI and API key from the configuration; fail if either is missing or empty.</li>
 *   <li>Obtain the publisher-URI to organization-name map from {@code getOrganizations()}.</li>
 *   <li>Select all resources typed as {@code DCAT_DATASET_CLASS} and process them one by one.</li>
 *   <li>Per dataset: look up its CKAN id, ask CKAN whether the package exists ({@code package_show}),
 *       create the publisher organization if it is not in the map ({@code organization_create}),
 *       build the package JSON (tags, metadata, resources), create the package if it did not exist
 *       ({@code package_create}) and finally send the full JSON via {@code package_update}.</li>
 *   <li>Close the three HTTP clients and mark the progress report as done.</li>
 * </ol>
 *
 * @throws LpException when required settings are missing, when a publisher has no name, or when
 *         closing an organization/create/update response fails
 */
@Override
public void execute() throws LpException {
    apiURI = configuration.getApiUri();
    if (apiURI == null || apiURI.isEmpty() || configuration.getApiKey() == null
            || configuration.getApiKey().isEmpty()) {
        throw exceptionFactory.failure("Missing required settings.");
    }
    Map<String, String> organizations = getOrganizations();
    LOG.debug("Querying metadata for datasets");
    LinkedList<String> datasets = new LinkedList<>();
    for (Map<String, Value> map : executeSelectQuery(
            "SELECT ?d WHERE {?d a <" + DcatAp11ToCkanBatchVocabulary.DCAT_DATASET_CLASS + ">}")) {
        datasets.add(map.get("d").stringValue());
    }
    int current = 0;
    int total = datasets.size();
    LOG.info("Found " + total + " datasets");
    progressReport.start(total);
    for (String datasetURI : datasets) {
        current++;
        CloseableHttpResponse queryResponse = null;
        LOG.info("Processing dataset " + current + "/" + total + ": " + datasetURI);
        String datasetID = executeSimpleSelectQuery("SELECT ?did WHERE {<" + datasetURI + "> <"
                + DcatAp11ToCkanBatchVocabulary.LODCZCKAN_DATASET_ID + "> ?did }", "did");
        if (datasetID.isEmpty()) {
            // Datasets without a CKAN id are skipped; note that progressReport.entryProcessed()
            // at the bottom of the loop is not reached for them.
            LOG.warn("Dataset " + datasetURI + " has missing CKAN ID");
            continue;
        }
        boolean datasetExists = false;
        // Indexes over the resources CKAN already holds for this package:
        // resource url -> id, distribution IRI ("distro_url") -> id, and id -> full resource JSON.
        Map<String, String> resUrlIdMap = new HashMap<>();
        Map<String, String> resDistroIdMap = new HashMap<>();
        Map<String, JSONObject> resourceList = new HashMap<>();
        LOG.debug("Querying for the dataset " + datasetID + " in CKAN");
        HttpGet httpGet = new HttpGet(apiURI + "/package_show?id=" + datasetID);
        try {
            queryResponse = queryClient.execute(httpGet);
            if (queryResponse.getStatusLine().getStatusCode() == 200) {
                LOG.debug("Dataset found");
                datasetExists = true;
                JSONObject response = new JSONObject(EntityUtils.toString(queryResponse.getEntity()))
                        .getJSONObject("result");
                JSONArray resourcesArray = response.getJSONArray("resources");
                for (int i = 0; i < resourcesArray.length(); i++) {
                    String id = resourcesArray.getJSONObject(i).getString("id");
                    resourceList.put(id, resourcesArray.getJSONObject(i));
                    String url = resourcesArray.getJSONObject(i).getString("url");
                    resUrlIdMap.put(url, id);
                    if (resourcesArray.getJSONObject(i).has("distro_url")) {
                        String distro = resourcesArray.getJSONObject(i).getString("distro_url");
                        resDistroIdMap.put(distro, id);
                    }
                }
            } else {
                // Any non-200 answer is treated as "dataset does not exist yet".
                String ent = EntityUtils.toString(queryResponse.getEntity());
                LOG.debug("Dataset not found: " + ent);
            }
        } catch (Exception e) {
            // Lookup problems are only logged; processing continues with datasetExists as set so far.
            LOG.error(e.getLocalizedMessage(), e);
        } finally {
            if (queryResponse != null) {
                try {
                    queryResponse.close();
                } catch (IOException e) {
                    LOG.error(e.getLocalizedMessage(), e);
                }
            }
        }
        // Keywords, restricted to the configured load language.
        LinkedList<String> keywords = new LinkedList<>();
        for (Map<String, Value> map : executeSelectQuery(
                "SELECT ?keyword WHERE {<" + datasetURI + "> <" + DcatAp11ToCkanBatchVocabulary.DCAT_KEYWORD
                        + "> ?keyword FILTER(LANGMATCHES(LANG(?keyword), \"" + configuration.getLoadLanguage()
                        + "\"))}")) {
            keywords.add(map.get("keyword").stringValue());
        }
        String publisher_uri = executeSimpleSelectQuery("SELECT ?publisher_uri WHERE {<" + datasetURI + "> <"
                + DCTERMS.PUBLISHER + "> ?publisher_uri }", "publisher_uri");
        String publisher_name = executeSimpleSelectQuery(
                "SELECT ?publisher_name WHERE {<" + datasetURI + "> <" + DCTERMS.PUBLISHER + ">/<" + FOAF.NAME
                        + "> ?publisher_name FILTER(LANGMATCHES(LANG(?publisher_name), \""
                        + configuration.getLoadLanguage() + "\"))}",
                "publisher_name");
        // Create the publisher's organization in CKAN when it is not in the local map yet.
        if (!organizations.containsKey(publisher_uri)) {
            LOG.debug("Creating organization " + publisher_uri);
            JSONObject root = new JSONObject();
            if (publisher_name == null || publisher_name.isEmpty()) {
                throw exceptionFactory.failure("Organization has no name: " + publisher_uri);
            }
            root.put("title", publisher_name);
            // Organization name: decompose accents (NFD), drop everything outside Basic Latin,
            // turn spaces and dots into dashes, lowercase.
            String orgname = Normalizer.normalize(publisher_name, Normalizer.Form.NFD)
                    .replaceAll("\\P{InBasic_Latin}", "").replace(' ', '-').replace('.', '-').toLowerCase();
            root.put("name", orgname);
            JSONArray org_extras = new JSONArray();
            org_extras.put(new JSONObject().put("key", "uri").put("value", publisher_uri));
            root.put("extras", org_extras);
            HttpPost httpPost = new HttpPost(apiURI + "/organization_create");
            httpPost.addHeader(new BasicHeader("Authorization", configuration.getApiKey()));
            String json = root.toString();
            httpPost.setEntity(new StringEntity(json, Charset.forName("utf-8")));
            CloseableHttpResponse response = null;
            try {
                response = postClient.execute(httpPost);
                if (response.getStatusLine().getStatusCode() == 200) {
                    LOG.debug("Organization created OK");
                    //LOG.info("Response: " + EntityUtils.toString(response.getEntity()));
                    organizations.put(publisher_uri, orgname);
                } else if (response.getStatusLine().getStatusCode() == 409) {
                    String ent = EntityUtils.toString(response.getEntity());
                    LOG.error("Organization conflict: " + ent);
                    throw exceptionFactory.failure("Organization conflict: " + ent);
                } else {
                    String ent = EntityUtils.toString(response.getEntity());
                    LOG.error("Response:" + ent);
                    throw exceptionFactory.failure("Error creating organization: " + ent);
                }
            } catch (Exception e) {
                // NOTE(review): the failures thrown in the try body above are raised inside this try,
                // so this catch-all presumably intercepts them (assuming LpException extends
                // Exception) and they end up only logged - confirm whether that is intended.
                LOG.error(e.getLocalizedMessage(), e);
            } finally {
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        LOG.error(e.getLocalizedMessage(), e);
                        throw exceptionFactory.failure("Error creating dataset");
                    }
                }
            }
        }
        LOG.debug("Creating JSON");
        // "root" is the package JSON that is later sent to package_update.
        JSONObject root = new JSONObject();
        JSONArray tags = new JSONArray();
        for (String keyword : keywords) {
            String safekeyword = fixKeyword(keyword);
            // Only keywords that are at least two characters long after fixKeyword() become tags.
            if (safekeyword.length() >= 2) {
                tags.put(new JSONObject().put("name", safekeyword));
            }
        }
        root.put("tags", tags);
        JSONArray resources = new JSONArray();
        if (!datasetID.isEmpty()) {
            root.put("name", datasetID);
        }
        String title = executeSimpleSelectQuery("SELECT ?title WHERE {<" + datasetURI + "> <" + DCTERMS.TITLE
                + "> ?title FILTER(LANGMATCHES(LANG(?title), \"" + configuration.getLoadLanguage() + "\"))}",
                "title");
        if (!title.isEmpty()) {
            root.put("title", title);
        }
        String description = executeSimpleSelectQuery("SELECT ?description WHERE {<" + datasetURI + "> <"
                + DCTERMS.DESCRIPTION + "> ?description FILTER(LANGMATCHES(LANG(?description), \""
                + configuration.getLoadLanguage() + "\"))}", "description");
        if (!description.isEmpty()) {
            root.put("notes", description);
        }
        String contactPoint = executeSimpleSelectQuery("SELECT ?contact WHERE {<" + datasetURI + "> <"
                + DcatAp11ToCkanBatchVocabulary.DCAT_CONTACT_POINT + ">/<"
                + DcatAp11ToCkanBatchVocabulary.VCARD_HAS_EMAIL + "> ?contact }", "contact");
        if (!contactPoint.isEmpty()) {
            root.put("maintainer_email", contactPoint);
        }
        String curatorName = executeSimpleSelectQuery(
                "SELECT ?name WHERE {<" + datasetURI + "> <" + DcatAp11ToCkanBatchVocabulary.DCAT_CONTACT_POINT
                        + ">/<" + DcatAp11ToCkanBatchVocabulary.VCARD_FN + "> ?name }",
                "name");
        if (!curatorName.isEmpty()) {
            root.put("maintainer", curatorName);
        }
        String issued = executeSimpleSelectQuery(
                "SELECT ?issued WHERE {<" + datasetURI + "> <" + DCTERMS.ISSUED + "> ?issued }", "issued");
        if (!issued.isEmpty()) {
            root.put("metadata_created", issued);
        }
        String modified = executeSimpleSelectQuery(
                "SELECT ?modified WHERE {<" + datasetURI + "> <" + DCTERMS.MODIFIED + "> ?modified }", "modified");
        if (!modified.isEmpty()) {
            root.put("metadata_modified", modified);
        }
        // Extra dataset-level fields that are only sent when the configured profile is PROFILES_NKOD.
        if (configuration.getProfile().equals(DcatAp11ToCkanBatchVocabulary.PROFILES_NKOD.stringValue())) {
            if (!publisher_uri.isEmpty()) {
                root.put("publisher_uri", publisher_uri);
            }
            if (!publisher_name.isEmpty()) {
                root.put("publisher_name", publisher_name);
            }
            String periodicity = executeSimpleSelectQuery("SELECT ?periodicity WHERE {<" + datasetURI + "> <"
                    + DCTERMS.ACCRUAL_PERIODICITY + "> ?periodicity }", "periodicity");
            if (!periodicity.isEmpty()) {
                root.put("frequency", periodicity);
            }
            String temporalStart = executeSimpleSelectQuery(
                    "SELECT ?temporalStart WHERE {<" + datasetURI + "> <" + DCTERMS.TEMPORAL + ">/<"
                            + DcatAp11ToCkanBatchVocabulary.SCHEMA_STARTDATE + "> ?temporalStart }",
                    "temporalStart");
            if (!temporalStart.isEmpty()) {
                root.put("temporal_start", temporalStart);
            }
            String temporalEnd = executeSimpleSelectQuery(
                    "SELECT ?temporalEnd WHERE {<" + datasetURI + "> <" + DCTERMS.TEMPORAL + ">/<"
                            + DcatAp11ToCkanBatchVocabulary.SCHEMA_ENDDATE + "> ?temporalEnd }",
                    "temporalEnd");
            if (!temporalEnd.isEmpty()) {
                root.put("temporal_end", temporalEnd);
            }
            String schemaURL = executeSimpleSelectQuery(
                    "SELECT ?schema WHERE {<" + datasetURI + "> <" + FOAF.PAGE + "> ?schema }", "schema");
            if (!schemaURL.isEmpty()) {
                root.put("schema", schemaURL);
            }
            String spatial = executeSimpleSelectQuery(
                    "SELECT ?spatial WHERE {<" + datasetURI + "> <" + DCTERMS.SPATIAL + "> ?spatial }", "spatial");
            if (!spatial.isEmpty()) {
                root.put("spatial_uri", spatial);
            }
            LinkedList<String> themes = new LinkedList<>();
            for (Map<String, Value> map : executeSelectQuery("SELECT ?theme WHERE {<" + datasetURI + "> <"
                    + DcatAp11ToCkanBatchVocabulary.DCAT_THEME + "> ?theme }")) {
                themes.add(map.get("theme").stringValue());
            }
            // Themes are sent as one string; every theme is followed by a single space.
            String concatThemes = "";
            for (String theme : themes) {
                concatThemes += theme + " ";
            }
            if (!concatThemes.isEmpty())
                root.put("theme", concatThemes);
        }
        //Distributions
        LinkedList<String> distributions = new LinkedList<>();
        for (Map<String, Value> map : executeSelectQuery("SELECT ?distribution WHERE {<" + datasetURI + "> <"
                + DcatAp11ToCkanBatchVocabulary.DCAT_DISTRIBUTION + "> ?distribution }")) {
            distributions.add(map.get("distribution").stringValue());
        }
        // Each distribution becomes one CKAN resource JSON ("distro").
        for (String distribution : distributions) {
            JSONObject distro = new JSONObject();
            String dtitle = executeSimpleSelectQuery("SELECT ?title WHERE {<" + distribution + "> <"
                    + DCTERMS.TITLE + "> ?title FILTER(LANGMATCHES(LANG(?title), \""
                    + configuration.getLoadLanguage() + "\"))}", "title");
            if (!dtitle.isEmpty()) {
                distro.put("name", dtitle);
            }
            String ddescription = executeSimpleSelectQuery("SELECT ?description WHERE {<" + distribution + "> <"
                    + DCTERMS.DESCRIPTION + "> ?description FILTER(LANGMATCHES(LANG(?description), \""
                    + configuration.getLoadLanguage() + "\"))}", "description");
            if (!ddescription.isEmpty()) {
                distro.put("description", ddescription);
            }
            //DCAT-AP v1.1: has to be an IRI from http://publications.europa.eu/mdr/authority/file-type/index.html
            String dformat = executeSimpleSelectQuery(
                    "SELECT ?format WHERE {<" + distribution + "> <" + DCTERMS.FORMAT + "> ?format }", "format");
            // The format IRI is resolved to its English label via the codelist query; without
            // codelists no "format" is sent.
            if (!dformat.isEmpty() && codelists != null) {
                String formatlabel = executeSimpleCodelistSelectQuery(
                        "SELECT ?formatlabel WHERE {<" + dformat + "> <" + SKOS.PREF_LABEL
                                + "> ?formatlabel FILTER(LANGMATCHES(LANG(?formatlabel), \"en\"))}",
                        "formatlabel");
                if (!formatlabel.isEmpty()) {
                    distro.put("format", formatlabel);
                }
            }
            String dwnld = executeSimpleSelectQuery("SELECT ?dwnld WHERE {<" + distribution + "> <"
                    + DcatAp11ToCkanBatchVocabulary.DCAT_DOWNLOADURL + "> ?dwnld }", "dwnld");
            String access = executeSimpleSelectQuery("SELECT ?acc WHERE {<" + distribution + "> <"
                    + DcatAp11ToCkanBatchVocabulary.DCAT_ACCESSURL + "> ?acc }", "acc");
            //we prefer downloadURL, but only accessURL is mandatory
            if (dwnld == null || dwnld.isEmpty()) {
                dwnld = access;
                if (dwnld == null || dwnld.isEmpty()) {
                    // Neither URL is available: this distribution is left out of the resources.
                    LOG.warn("Empty download and access URLs: " + datasetURI);
                    continue;
                }
            }
            if (!dwnld.isEmpty()) {
                distro.put("url", dwnld);
            }
            if (!distribution.isEmpty()) {
                distro.put("distro_url", distribution);
            }
            distro.put("resource_type", "file");
            // Reuse the id of an existing CKAN resource, matched first by distribution IRI and
            // then by URL; a matched resource is removed from resourceList so it is not re-added below.
            if (resDistroIdMap.containsKey(distribution)) {
                String id = resDistroIdMap.get(distribution);
                distro.put("id", id);
                resourceList.remove(id);
            } else if (resUrlIdMap.containsKey(dwnld)) {
                String id = resUrlIdMap.get(dwnld);
                distro.put("id", id);
                resourceList.remove(id);
            }
            String dissued = executeSimpleSelectQuery(
                    "SELECT ?issued WHERE {<" + distribution + "> <" + DCTERMS.ISSUED + "> ?issued }", "issued");
            if (!dissued.isEmpty()) {
                distro.put("created", dissued);
            }
            String dmodified = executeSimpleSelectQuery(
                    "SELECT ?modified WHERE {<" + distribution + "> <" + DCTERMS.MODIFIED + "> ?modified }",
                    "modified");
            if (!dmodified.isEmpty()) {
                distro.put("last_modified", dmodified);
            }
            // Extra distribution-level fields for the PROFILES_NKOD profile.
            if (configuration.getProfile().equals(DcatAp11ToCkanBatchVocabulary.PROFILES_NKOD.stringValue())) {
                String dtemporalStart = executeSimpleSelectQuery(
                        "SELECT ?temporalStart WHERE {<" + distribution + "> <" + DCTERMS.TEMPORAL + ">/<"
                                + DcatAp11ToCkanBatchVocabulary.SCHEMA_STARTDATE + "> ?temporalStart }",
                        "temporalStart");
                if (!dtemporalStart.isEmpty()) {
                    distro.put("temporal_start", dtemporalStart);
                }
                String dtemporalEnd = executeSimpleSelectQuery(
                        "SELECT ?temporalEnd WHERE {<" + distribution + "> <" + DCTERMS.TEMPORAL + ">/<"
                                + DcatAp11ToCkanBatchVocabulary.SCHEMA_ENDDATE + "> ?temporalEnd }",
                        "temporalEnd");
                if (!dtemporalEnd.isEmpty()) {
                    distro.put("temporal_end", dtemporalEnd);
                }
                String dspatial = executeSimpleSelectQuery(
                        "SELECT ?spatial WHERE {<" + distribution + "> <" + DCTERMS.SPATIAL + "> ?spatial }",
                        "spatial");
                if (!dspatial.isEmpty()) {
                    // NOTE(review): this writes to root (the dataset JSON), not distro, unlike every
                    // neighbouring distribution field - it overwrites the dataset's spatial_uri.
                    // Confirm whether distro was meant.
                    root.put("spatial_uri", dspatial);
                }
                String dschemaURL = executeSimpleSelectQuery(
                        "SELECT ?schema WHERE {<" + distribution + "> <" + DCTERMS.CONFORMS_TO + "> ?schema }",
                        "schema");
                if (!dschemaURL.isEmpty()) {
                    distro.put("describedBy", dschemaURL);
                }
                String dlicense = executeSimpleSelectQuery(
                        "SELECT ?license WHERE {<" + distribution + "> <" + DCTERMS.LICENSE + "> ?license }",
                        "license");
                if (!dlicense.isEmpty()) {
                    distro.put("license_link", dlicense);
                }
                String dmimetype = executeSimpleSelectQuery("SELECT ?format WHERE {<" + distribution + "> <"
                        + DcatAp11ToCkanBatchVocabulary.DCAT_MEDIATYPE + "> ?format }", "format");
                if (!dmimetype.isEmpty()) {
                    // Keep only the last two slash-separated segments of the media type value.
                    distro.put("mimetype", dmimetype.replaceAll(".*\\/([^\\/]+\\/[^\\/]+)", "$1"));
                }
            }
            resources.put(distro);
        }
        //Add the remaining distributions that were not updated but existed in the original dataset
        for (Entry<String, JSONObject> resource : resourceList.entrySet()) {
            resources.put(resource.getValue());
        }
        root.put("resources", resources);
        //Create new dataset
        if (!datasetExists) {
            // The package is first created with a minimal JSON (name, title, owner_org);
            // the complete JSON follows in the package_update call below.
            JSONObject createRoot = new JSONObject();
            CloseableHttpResponse response = null;
            createRoot.put("name", datasetID);
            createRoot.put("title", title);
            createRoot.put("owner_org", organizations.get(publisher_uri));
            LOG.debug("Creating dataset in CKAN");
            HttpPost httpPost = new HttpPost(apiURI + "/package_create?id=" + datasetID);
            httpPost.addHeader(new BasicHeader("Authorization", configuration.getApiKey()));
            String json = createRoot.toString();
            LOG.debug("Creating dataset with: " + json);
            httpPost.setEntity(new StringEntity(json, Charset.forName("utf-8")));
            try {
                response = createClient.execute(httpPost);
                if (response.getStatusLine().getStatusCode() == 200) {
                    LOG.debug("Dataset created OK");
                    //LOG.info("Response: " + EntityUtils.toString(response.getEntity()));
                } else if (response.getStatusLine().getStatusCode() == 409) {
                    String ent = EntityUtils.toString(response.getEntity());
                    LOG.error("Dataset already exists: " + ent);
                    throw exceptionFactory.failure("Dataset already exists");
                } else {
                    String ent = EntityUtils.toString(response.getEntity());
                    LOG.error("Response:" + ent);
                    throw exceptionFactory.failure("Error creating dataset");
                }
            } catch (Exception e) {
                // NOTE(review): as with organization creation, the failures thrown in the try body
                // presumably land here and are only logged, so the update below still runs - confirm.
                LOG.error(e.getLocalizedMessage(), e);
            } finally {
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        LOG.error(e.getLocalizedMessage(), e);
                        throw exceptionFactory.failure("Error creating dataset");
                    }
                }
            }
        }
        //Update existing dataset
        String json = root.toString();
        LOG.debug("Posting to CKAN");
        HttpPost httpPost = new HttpPost(apiURI + "/package_update?id=" + datasetID);
        httpPost.addHeader(new BasicHeader("Authorization", configuration.getApiKey()));
        LOG.debug(json);
        httpPost.setEntity(new StringEntity(json, Charset.forName("utf-8")));
        CloseableHttpResponse response = null;
        try {
            response = postClient.execute(httpPost);
            if (response.getStatusLine().getStatusCode() == 200) {
                //LOG.info("Response:" + EntityUtils.toString(response.getEntity()));
            } else {
                String ent = EntityUtils.toString(response.getEntity());
                LOG.error("Response:" + ent);
                throw exceptionFactory.failure("Error updating dataset");
            }
        } catch (Exception e) {
            // NOTE(review): the "Error updating dataset" failure thrown above is presumably caught
            // here and only logged - confirm whether a failed update should abort the run.
            LOG.error(e.getLocalizedMessage(), e);
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    LOG.error(e.getLocalizedMessage(), e);
                    throw exceptionFactory.failure("Error updating dataset");
                }
            }
        }
        progressReport.entryProcessed();
    }
    // All datasets handled: release the HTTP clients; close errors are only logged.
    try {
        queryClient.close();
        createClient.close();
        postClient.close();
    } catch (IOException e) {
        LOG.error(e.getLocalizedMessage(), e);
    }
    progressReport.done();
}
From source file:io.github.swagger2markup.markup.builder.internal.AbstractMarkupDocBuilder.java
/**
 * Turns a raw anchor into a normalized anchor id.
 *
 * <p>The anchor prefix (if any) is prepended to the trimmed anchor, combining diacritical marks
 * are removed after NFD decomposition, every match of {@code ANCHOR_IGNORABLE_PATTERN} is
 * replaced by {@code spaceEscape}, runs of separation characters are collapsed to their first
 * character, leading/trailing separation characters are stripped and the result is lower-cased.
 * If {@code ANCHOR_UNIGNORABLE_PATTERN} still matches anything in that result, the MD5 hex digest
 * of the result is returned instead.
 *
 * @param spaceEscape replacement used for ignorable characters (its {@code toString()} value)
 * @param anchor      raw anchor text; must not be {@code null}
 * @return the normalized anchor, or an MD5 hex digest when it contains unsupported characters
 */
protected String normalizeAnchor(Markup spaceEscape, String anchor) {
    String prefixed = defaultString(anchorPrefix) + anchor.trim();

    String withoutMarks = Normalizer.normalize(prefixed, Normalizer.Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    String escaped = ANCHOR_IGNORABLE_PATTERN.matcher(withoutMarks).replaceAll(spaceEscape.toString());

    // A separation character followed by one or more separation characters keeps only the first.
    String separatorRun = String.format("([%1$s])([%1$s]+)", ANCHOR_SEPARATION_CHARACTERS);
    String collapsed = escaped.replaceAll(separatorRun, "$1");

    String candidate = StringUtils.strip(collapsed, ANCHOR_SEPARATION_CHARACTERS).trim().toLowerCase();

    // If removing the "unignorable" matches shortens the candidate, it cannot be used as is.
    String cleaned = ANCHOR_UNIGNORABLE_PATTERN.matcher(candidate).replaceAll("");
    if (cleaned.length() == candidate.length()) {
        return cleaned;
    }
    return DigestUtils.md5Hex(candidate);
}
From source file:com.ved.musicmapapp.LoginAcitivity.java
/**
 * Strips accents from a string.
 *
 * <p>The input is decomposed (NFD) and all combining diacritical marks are removed. The letters
 * D/d with stroke (U+0110 / U+0111) have no canonical decomposition, so they are mapped to plain
 * {@code D} / {@code d} explicitly.
 *
 * <p>The previous version called {@code replaceAll("?", "D")}, which is an invalid regular
 * expression (dangling {@code ?}) and threw {@code PatternSyntaxException} on every call; the
 * stroke letters had been lost to an encoding error.
 *
 * @param s text to clean; must not be {@code null}
 * @return {@code s} without combining diacritical marks and with the stroke letters replaced
 */
public String unAccent(String s) {
    String decomposed = Normalizer.normalize(s, Normalizer.Form.NFD);
    Pattern combiningMarks = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
    return combiningMarks.matcher(decomposed).replaceAll("")
            // Literal (non-regex) replacements for the two stroke letters.
            .replace("\u0110", "D")
            .replace("\u0111", "d");
}
From source file:com.joliciel.talismane.tokeniser.filters.TokenRegexFilterImpl.java
/**
 * Lazily builds and caches the compiled pattern for this filter's regex.
 *
 * <p>On first call the raw regex goes through up to three rewrites before compilation:
 * <ol>
 *   <li>if {@code autoWordBoundaries} is set, a word boundary is added in front and/or at the
 *       end when the regex is judged to start/end with a letter-like construct;</li>
 *   <li>if the filter is not case sensitive or not diacritic sensitive, each letter outside
 *       escapes, character groups and curly-brace sections is expanded to a character class of
 *       its accepted variants;</li>
 *   <li>every match of {@code wordListPattern} is replaced by an alternation of the words of the
 *       named external word list, optionally with per-letter case/diacritic groups.</li>
 * </ol>
 * The result is compiled with {@code Pattern.UNICODE_CHARACTER_CLASS} and stored in
 * {@code this.pattern}; later calls return the stored pattern.
 *
 * @return the compiled pattern
 */
Pattern getPattern() {
    if (pattern == null) {
        // we may need to replace WordLists by the list contents
        String myRegex = this.regex;
        if (LOG.isTraceEnabled()) {
            LOG.trace("Regex: " + myRegex);
        }
        if (this.autoWordBoundaries) {
            // Scan from the front until the first construct decides whether the regex
            // starts with a letter-like element (null = undecided).
            Boolean startsWithLetter = null;
            for (int i = 0; i < myRegex.length() && startsWithLetter == null; i++) {
                char c = myRegex.charAt(i);
                if (c == '\\') {
                    i++;
                    c = myRegex.charAt(i);
                    if (c == 'd' || c == 'w') {
                        startsWithLetter = true;
                    } else if (c == 's' || c == 'W' || c == 'b' || c == 'B') {
                        startsWithLetter = false;
                    } else if (c == 'p') {
                        i += 2; // skip the open curly brackets
                        int closeCurlyBrackets = myRegex.indexOf('}', i);
                        int openParentheses = myRegex.indexOf('(', i);
                        int endIndex = closeCurlyBrackets;
                        if (openParentheses > 0 && openParentheses < closeCurlyBrackets)
                            endIndex = openParentheses;
                        if (endIndex > 0) {
                            String specialClass = myRegex.substring(i, endIndex);
                            if (specialClass.equals("WordList")) {
                                startsWithLetter = true;
                            }
                        }
                    }
                    // An escape always ends the scan, decided or not.
                    break;
                } else if (c == '[' || c == '(') {
                    // do nothing
                } else if (Character.isLetter(c) || Character.isDigit(c)) {
                    startsWithLetter = true;
                } else {
                    startsWithLetter = false;
                }
            }
            // Same idea, scanning backwards from the end of the regex.
            Boolean endsWithLetter = null;
            for (int i = myRegex.length() - 1; i >= 0 && endsWithLetter == null; i--) {
                char c = myRegex.charAt(i);
                char prevC = ' ';
                if (i >= 1)
                    prevC = myRegex.charAt(i - 1);
                if (prevC == '\\') {
                    if (c == 'd' || c == 'w') {
                        endsWithLetter = true;
                    } else if (c == 's' || c == 'W' || c == 'b' || c == 'B') {
                        endsWithLetter = false;
                    } else if (c == 'p') {
                        i += 2; // skip the open curly brackets
                        int closeCurlyBrackets = myRegex.indexOf('}', i);
                        int openParentheses = myRegex.indexOf('(', i);
                        int endIndex = closeCurlyBrackets;
                        if (openParentheses < closeCurlyBrackets)
                            endIndex = openParentheses;
                        if (endIndex > 0) {
                            String specialClass = myRegex.substring(i, endIndex);
                            if (specialClass.equals("WordList") || specialClass.equals("Alpha")
                                    || specialClass.equals("Lower") || specialClass.equals("Upper")
                                    || specialClass.equals("ASCII") || specialClass.equals("Digit")) {
                                // NOTE(review): this branch is in the end-of-regex scan but assigns
                                // startsWithLetter, not endsWithLetter - looks like a copy/paste
                                // slip; confirm before relying on it.
                                startsWithLetter = true;
                            }
                        }
                    }
                    break;
                } else if (c == ']' || c == ')' || c == '+') {
                    // do nothing
                } else if (c == '}') {
                    // The regex ends with a curly-brace section: inspect the name that follows
                    // the last '{'.
                    int startIndex = myRegex.lastIndexOf('{') + 1;
                    int closeCurlyBrackets = myRegex.indexOf('}', startIndex);
                    int openParentheses = myRegex.indexOf('(', startIndex);
                    int endIndex = closeCurlyBrackets;
                    if (openParentheses > 0 && openParentheses < closeCurlyBrackets)
                        endIndex = openParentheses;
                    if (endIndex > 0) {
                        String specialClass = myRegex.substring(startIndex, endIndex);
                        if (specialClass.equals("WordList") || specialClass.equals("Alpha")
                                || specialClass.equals("Lower") || specialClass.equals("Upper")
                                || specialClass.equals("ASCII") || specialClass.equals("Digit")) {
                            endsWithLetter = true;
                        }
                    }
                    break;
                } else if (Character.isLetter(c) || Character.isDigit(c)) {
                    endsWithLetter = true;
                } else {
                    endsWithLetter = false;
                }
            }
            if (startsWithLetter != null && startsWithLetter) {
                myRegex = "\\b" + myRegex;
            }
            if (endsWithLetter != null && endsWithLetter) {
                myRegex = myRegex + "\\b";
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("After autoWordBoundaries: " + myRegex);
            }
        }
        if (!this.caseSensitive || !this.diacriticSensitive) {
            // Rewrite the regex letter by letter, expanding each plain letter to a character
            // class of its accepted variants.
            StringBuilder regexBuilder = new StringBuilder();
            for (int i = 0; i < myRegex.length(); i++) {
                char c = myRegex.charAt(i);
                if (c == '\\') {
                    // escape - skip next
                    regexBuilder.append(c);
                    i++;
                    c = myRegex.charAt(i);
                    regexBuilder.append(c);
                } else if (c == '[') {
                    // character group, don't change it
                    regexBuilder.append(c);
                    // NOTE(review): i is incremented before charAt(i), so an unterminated '['
                    // would presumably run past the end of the string - confirm inputs are
                    // always well-formed.
                    while (c != ']' && i < myRegex.length()) {
                        i++;
                        c = myRegex.charAt(i);
                        regexBuilder.append(c);
                    }
                } else if (c == '{') {
                    // command, don't change it
                    regexBuilder.append(c);
                    while (c != '}' && i < myRegex.length()) {
                        i++;
                        c = myRegex.charAt(i);
                        regexBuilder.append(c);
                    }
                } else if (Character.isLetter(c)) {
                    // Collect the distinct variants of this letter (sorted, via TreeSet).
                    Set<String> chars = new TreeSet<String>();
                    chars.add("" + c);
                    // First char of the NFD-decomposed letter after diacriticPattern matches
                    // are removed.
                    char noAccent = diacriticPattern.matcher(Normalizer.normalize("" + c, Form.NFD))
                            .replaceAll("").charAt(0);
                    if (!this.caseSensitive) {
                        chars.add("" + Character.toUpperCase(c));
                        chars.add("" + Character.toLowerCase(c));
                        chars.add("" + Character.toUpperCase(noAccent));
                    }
                    if (!this.diacriticSensitive) {
                        chars.add("" + noAccent);
                        if (!this.caseSensitive) {
                            chars.add("" + Character.toLowerCase(noAccent));
                        }
                    }
                    if (chars.size() == 1) {
                        regexBuilder.append(c);
                    } else {
                        regexBuilder.append('[');
                        for (String oneChar : chars) {
                            regexBuilder.append(oneChar);
                        }
                        regexBuilder.append(']');
                    }
                } else {
                    regexBuilder.append(c);
                }
            }
            myRegex = regexBuilder.toString();
            if (LOG.isTraceEnabled()) {
                LOG.trace("After caseSensitive: " + myRegex);
            }
        }
        // Replace each word-list reference by an alternation of the list's words.
        Matcher matcher = wordListPattern.matcher(myRegex);
        StringBuilder regexBuilder = new StringBuilder();
        int lastIndex = 0;
        while (matcher.find()) {
            // Group 1 is a comma-separated list: the word list name followed by options.
            String[] params = matcher.group(1).split(",");
            int start = matcher.start();
            int end = matcher.end();
            regexBuilder.append(myRegex.substring(lastIndex, start));
            String wordListName = params[0];
            boolean uppercaseOptional = false;
            boolean diacriticsOptional = false;
            boolean lowercaseOptional = false;
            boolean firstParam = true;
            for (String param : params) {
                if (firstParam) {
                    /* word list name */
                } else if (param.equals("diacriticsOptional"))
                    diacriticsOptional = true;
                else if (param.equals("uppercaseOptional"))
                    uppercaseOptional = true;
                else if (param.equals("lowercaseOptional"))
                    lowercaseOptional = true;
                else
                    throw new TalismaneException(
                            "Unknown parameter in word list " + matcher.group(1) + ": " + param);
                firstParam = false;
            }
            ExternalWordList wordList = externalResourceFinder.getExternalWordList(wordListName);
            if (wordList == null)
                throw new TalismaneException("Unknown word list: " + wordListName);
            StringBuilder sb = new StringBuilder();
            boolean firstWord = true;
            for (String word : wordList.getWordList()) {
                if (!firstWord)
                    sb.append("|");
                word = Normalizer.normalize(word, Form.NFC);
                // Note: lowercaseOptional alone does not enter this branch; it is only
                // evaluated together with uppercaseOptional or diacriticsOptional.
                if (uppercaseOptional || diacriticsOptional) {
                    String wordNoDiacritics = Normalizer.normalize(word, Form.NFD)
                            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
                    String wordLowercase = word.toLowerCase(Locale.ENGLISH);
                    String wordLowercaseNoDiacritics = Normalizer.normalize(wordLowercase, Form.NFD)
                            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
                    String wordUppercase = wordNoDiacritics.toUpperCase(Locale.ENGLISH);
                    boolean needsGrouping = false;
                    if (uppercaseOptional && !word.equals(wordLowercase))
                        needsGrouping = true;
                    if (diacriticsOptional && !word.equals(wordNoDiacritics))
                        needsGrouping = true;
                    if (lowercaseOptional && !word.equals(wordUppercase))
                        needsGrouping = true;
                    if (needsGrouping) {
                        // Compare the word position by position with its variants.
                        // NOTE(review): this indexes all variants with the same i, which
                        // assumes they have the same length as word - confirm.
                        for (int i = 0; i < word.length(); i++) {
                            char c = word.charAt(i);
                            boolean grouped = false;
                            if (uppercaseOptional && c != wordLowercase.charAt(i))
                                grouped = true;
                            if (diacriticsOptional && c != wordNoDiacritics.charAt(i))
                                grouped = true;
                            if (lowercaseOptional && c != wordUppercase.charAt(i))
                                grouped = true;
                            if (!grouped)
                                sb.append(c);
                            else {
                                // Emit a character class with the original char plus each
                                // enabled variant not already present.
                                sb.append("[");
                                String group = "" + c;
                                if (uppercaseOptional && group.indexOf(wordLowercase.charAt(i)) < 0)
                                    group += (wordLowercase.charAt(i));
                                if (lowercaseOptional && group.indexOf(wordUppercase.charAt(i)) < 0)
                                    group += (wordUppercase.charAt(i));
                                if (diacriticsOptional && group.indexOf(wordNoDiacritics.charAt(i)) < 0)
                                    group += (wordNoDiacritics.charAt(i));
                                if (uppercaseOptional && diacriticsOptional
                                        && group.indexOf(wordLowercaseNoDiacritics.charAt(i)) < 0)
                                    group += (wordLowercaseNoDiacritics.charAt(i));
                                sb.append(group);
                                sb.append("]");
                            } // does this letter need grouping?
                        } // next letter
                    } else {
                        sb.append(word);
                    } // any options activated?
                } else {
                    sb.append(word);
                }
                firstWord = false;
            } // next word in list
            regexBuilder.append(sb.toString());
            lastIndex = end;
        } // next match
        // Append whatever follows the last word-list reference (or the whole regex if none).
        regexBuilder.append(myRegex.substring(lastIndex));
        myRegex = regexBuilder.toString();
        this.pattern = Pattern.compile(myRegex, Pattern.UNICODE_CHARACTER_CLASS);
    }
    return pattern;
}
From source file:org.rascalmpl.library.cobra.RandomValueTypeVisitor.java
@Override public IValue visitString(Type type) { if (maxDepth <= 0 || (stRandom.nextInt(2) == 0)) { return vf.string(""); } else {// w ww . j ava 2 s . c o m RandomValueTypeVisitor visitor = descend(); IString str = vf.string(visitor.generate(type).toString()); IString result = str.concat(vf.string(RandomStringUtils.random(1))); // make sure we are not generating very strange sequences String normalized = Normalizer.normalize(result.getValue(), Form.NFC); return vf.string(normalized); } }
From source file:org.openremote.foxycart.resources.FoxyCartResource.java
private String removeDiacritics(String text) { // characters text = text.replace("\u00E4", "ae"); text = text.replace("\u00F1", "ny"); text = text.replace("\u00F6", "oe"); text = text.replace("\u00FC", "ue"); text = text.replace("\u00FF", "yu"); text = Normalizer.normalize(text, Normalizer.Form.NFD); text = text.replaceAll("\\p{M}", ""); text = text.replace("\u00DF", "ss"); text = text.replace("\u00C6", "AE"); text = text.replace("\u00E6", "ae"); text = text.replace("\u0132", "IJ"); text = text.replace("\u0133", "ij"); text = text.replace("\u0152", "Oe"); text = text.replace("\u0153", "oe"); // ??/*from w w w . ja v a 2 s . co m*/ text = text.replace("\u00D0", "D"); text = text.replace("\u0110", "D"); text = text.replace("\u00F0", "d"); text = text.replace("\u0111", "d"); text = text.replace("\u0126", "H"); text = text.replace("\u0127", "h"); // ? text = text.replace("\u0131", "i"); text = text.replace("\u0138", "k"); text = text.replace("\u013F", "L"); text = text.replace("\u0141", "L"); text = text.replace("\u0140", "l"); text = text.replace("\u0142", "l"); // text = text.replace("\u014A", "N"); text = text.replace("\u0149", "n"); text = text.replace("\u014B", "n"); text = text.replace("\u00D8", "O"); text = text.replace("\u00F8", "o"); text = text.replace("\u017F", "s"); // text = text.replace("\u00DE", "T"); text = text.replace("\u0166", "T"); text = text.replace("\u00FE", "t"); text = text.replace("\u0167", "t"); return text; }
From source file:org.dspace.installer_edm.InstallerEDMBase.java
/**
 * Removes the accents from a string.
 *
 * <p>Besides stripping combining diacritical marks (after NFD decomposition), every run of one
 * or more spaces is replaced by a single underscore.
 *
 * @param text original string, may be {@code null}
 * @return the string without accents, or {@code null} when {@code text} is {@code null}
 */
public String removeAccents(String text) {
    if (text == null) {
        return null;
    }
    String underscored = text.replaceAll(" +", "_");
    String decomposed = Normalizer.normalize(underscored, Form.NFD);
    return decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
}
From source file:org.structr.core.graph.search.Search.java
/**
 * Reduces a string to its ASCII characters.
 *
 * <p>The input is NFD-decomposed first, so an accented letter keeps its base letter while the
 * separated accent is dropped; every remaining non-ASCII character is dropped as well.
 *
 * @param input the text to normalize; must not be {@code null}
 * @return the decomposed text with all non-ASCII characters removed
 */
public static String normalize(final String input) {
    final String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
    final StringBuilder asciiOnly = new StringBuilder(decomposed.length());
    for (int i = 0; i < decomposed.length(); i++) {
        final char c = decomposed.charAt(i);
        // ASCII is the range 0x00-0x7F; everything else is discarded.
        if (c < 0x80) {
            asciiOnly.append(c);
        }
    }
    return asciiOnly.toString();
}
From source file:de.pangaea.fixo3.xml.ProcessXmlFiles.java
/**
 * Converts a string to plain ASCII.
 *
 * <p>The string is NFD-decomposed and every character outside the range 0x00-0x7F is removed,
 * so accented letters are reduced to their base letter.
 *
 * @param s the text to convert; must not be {@code null}
 * @return the ASCII-only text
 */
private String toAscii(String s) {
    StringBuilder kept = new StringBuilder();
    for (char unit : Normalizer.normalize(s, Normalizer.Form.NFD).toCharArray()) {
        if (unit <= 0x7F) {
            kept.append(unit);
        }
    }
    return kept.toString();
}
From source file:com.cloudbees.hudson.plugins.folder.ChildNameGeneratorTest.java
private void checkComputedFolder(ComputedFolderImpl instance, int round, Normalizer.Form form) throws IOException { assertThat("We detected the filesystem normalization form", form, notNullValue()); instance.assertItemNames(round, "$$child-one", "$$child_two", "$$child three", "$$leanbh cu\u0301ig", "$$ ?", "$$", "$$\u110b\u1161\u110b\u1175 7", "$$nin\u0303o ocho"); instance.assertItemShortUrls(round, "job/$$child-one/", "job/$$child_two/", "job/$$child%20three/", "job/$$leanbh%20cu%CC%81ig/", "job/$$%D1%80%D0%B5%D0%B1%D0%B5%D0%BD%D0%BE%D0%BA%20%D0%BF%D1%8F%D1%82%D1%8C/", // ? "job/$$%E5%84%BF%E7%AB%A5%E5%85%AD/", // "job/$$%E1%84%8B%E1%85%A1%E1%84%8B%E1%85%B5%207/", // ? 7 "job/$$nin%CC%83o%20ocho/"); switch (form) { case NFC:/*from w w w .j a va2 s .c o m*/ case NFKC: instance.assertItemDirs(round, "child_on-1ec93354e47959489d1440d", "child_tw-bca7d461e11f4f3ed12fd0d", "child_th-b7a6e5662f26eb036090308", "leanbh_c-cde398abd1bc432e87c49ca", "________-97e4b38574769f9d9968fe9", // ? "___-d22e9fe51690274d8262bda", // "_____7-d57fff123224bd679e4213b", // ? 7 "nin_o_oc-1a0c91070942136ba398919"); break; case NFD: case NFKD: instance.assertItemDirs(round, "child_on-1ec93354e47959489d1440d", "child_tw-bca7d461e11f4f3ed12fd0d", "child_th-b7a6e5662f26eb036090308", "leanbh_c-66fe5ac0be4a896280ef09f", "________-97e4b38574769f9d9968fe9", // ? "___-d22e9fe51690274d8262bda", // "_____7-6d2219439eec0df19863ab8", // ? 7 "nin_o_oc-782e3bad2d233732a03f9dd"); break; } for (String name : Arrays.asList("child-one", "child_two", "child three", "leanbh cig", " ?", "", "? 7", "nio ocho")) { checkChild(instance, Normalizer.normalize(name, form)); } }