// Java tutorial
/* Copyright 2014 Norconex Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.norconex.importer.handler.transformer.impl; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.regex.Pattern; import javax.xml.stream.XMLStreamException; import org.apache.commons.configuration.HierarchicalConfiguration; import org.apache.commons.configuration.XMLConfiguration; import org.apache.commons.lang3.builder.HashCodeBuilder; import org.apache.commons.lang3.builder.ToStringBuilder; import com.norconex.commons.lang.xml.EnhancedXMLStreamWriter; import com.norconex.importer.doc.ImporterMetadata; import com.norconex.importer.handler.transformer.AbstractStringTransformer; /** * <p>Reduces specified consecutive characters or strings to only one * instance (document content only). * If reducing duplicate words, you usually have to add a space at the * Beginning or end of the word. 
* </p> * * <p>This class can be used as a pre-parsing (text content-types only) * or post-parsing handlers.</p> * <p> * XML configuration usage: * </p> * <pre> * <transformer class="com.norconex.importer.handler.transformer.impl.ReduceConsecutivesTransformer" * caseSensitive="[false|true]" > * <reduce>(character or string to strip)</reduce> * <!-- multiple reduce tags allowed --> * * <restrictTo caseSensitive="[false|true]" * field="(name of header/metadata field name to match)"> * (regular expression of value to match) * </restrictTo> * <!-- multiple "restrictTo" tags allowed (only one needs to match) --> * </transformer> * </pre> * You can specify these special characters in your XML: * <ul> * <li>\r (carriage returns)</li> * <li>\n (line feed)</li> * <li>\t (tab)</li> * <li>\s (space)</li> * </ul> * @author Pascal Essiembre * @since 1.2.0 */ public class ReduceConsecutivesTransformer extends AbstractStringTransformer { private boolean caseSensitive; private final List<String> reductions = new ArrayList<String>(); @Override protected void transformStringContent(String reference, StringBuilder content, ImporterMetadata metadata, boolean parsed, boolean partialContent) { String text = content.toString(); content.setLength(0); Pattern pattern = null; for (String reduction : reductions) { String regex = "(" + escapeRegex(reduction) + ")+"; if (caseSensitive) { pattern = Pattern.compile(regex); } else { pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE); } text = pattern.matcher(text).replaceAll("$1"); } content.append(text); } public List<String> getReductions() { return new ArrayList<String>(reductions); } public void setReductions(String... reductions) { this.reductions.clear(); addReductions(reductions); } public void addReductions(String... reductions) { this.reductions.addAll(Arrays.asList(reductions)); } public boolean isCaseSensitive() { return caseSensitive; } /** * Sets whether to ignore case when matching characters or string * to reduce. 
* @param caseSensitive <code>true</code> to consider character case */ public void setCaseSensitive(boolean caseSensitive) { this.caseSensitive = caseSensitive; } private String escapeRegex(String text) { return text.replaceAll("([\\\\\\.\\[\\{\\(\\*\\+\\?\\^\\$\\|])", "\\\\$1"); } @Override protected void loadHandlerFromXML(XMLConfiguration xml) throws IOException { setCaseSensitive(xml.getBoolean("[@caseSensitive]", false)); List<HierarchicalConfiguration> nodes = xml.configurationsAt("reduce"); for (HierarchicalConfiguration node : nodes) { String text = node.getString(""); text = text.replaceAll("\\\\s", " "); text = text.replaceAll("\\\\t", "\t"); text = text.replaceAll("\\\\n", "\n"); text = text.replaceAll("\\\\r", "\r"); addReductions(text); } } @Override protected void saveHandlerToXML(EnhancedXMLStreamWriter writer) throws XMLStreamException { writer.writeAttribute("caseSensitive", Boolean.toString(isCaseSensitive())); for (String reduction : reductions) { if (reduction != null) { writer.writeStartElement("reduce"); String text = reduction; text = text.replaceAll(" ", "\\\\s"); text = text.replaceAll("\t", "\\\\t"); text = text.replaceAll("\n", "\\\\n"); text = text.replaceAll("\r", "\\\\r"); writer.writeCharacters(text); writer.writeEndElement(); } } } @Override public int hashCode() { return new HashCodeBuilder().appendSuper(super.hashCode()).append(caseSensitive).append(reductions) .toHashCode(); } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!super.equals(obj)) { return false; } if (getClass() != obj.getClass()) { return false; } ReduceConsecutivesTransformer other = (ReduceConsecutivesTransformer) obj; if (caseSensitive != other.caseSensitive) { return false; } if (reductions == null) { if (other.reductions != null) { return false; } } else if (!reductions.equals(other.reductions)) { return false; } return true; } @Override public String toString() { return new 
ToStringBuilder(this).appendSuper(super.toString()).append("caseSensitive", caseSensitive) .append("reductions", reductions).toString(); } }