gov.nyc.doitt.gis.geoclient.parser.test.ChunkSpecParser.java Source code

Java tutorial

Introduction

Here is the source code for gov.nyc.doitt.gis.geoclient.parser.test.ChunkSpecParser.java

Source

/*
 * Copyright 2013-2016 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gov.nyc.doitt.gis.geoclient.parser.test;

import gov.nyc.doitt.gis.geoclient.parser.token.Chunk;
import gov.nyc.doitt.gis.geoclient.parser.token.ChunkType;
import gov.nyc.doitt.gis.geoclient.parser.token.Token;
import gov.nyc.doitt.gis.geoclient.parser.token.TokenType;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;

public class ChunkSpecParser {
    private static final Logger LOGGER = LoggerFactory.getLogger(ChunkSpecParser.class);
    private static final Pattern TOKEN_TYPE_OCCURRENCE_PATTERN = Pattern.compile("^(\\w+(?:_\\w+)?)\\{(.*)\\}$");
    private static final Pattern TOKEN_VALUE_PATTERN = Pattern.compile("\\[([^\\]]+)\\]");
    private static final Pattern CHUNK_DEFINITION_BODY_PATTERN = Pattern.compile("^\\w+(?:_\\w+)?\\((.*)\\)$");

    public ChunkSpec parse(String id, String expectationText) throws TestConfigurationException {
        if (StringUtils.isEmpty(id)) {
            throw new TestConfigurationException("Parameter 'id' cannot be null or empty.");
        }
        if (StringUtils.isEmpty(expectationText)) {
            throw new TestConfigurationException("Parameter 'unparsedString' cannot be null or empty.");
        }
        String[] splits = expectationText.split("=");

        if (splits.length == 1) {
            return createDefaultChunkSpec(id, splits[0]);
        }

        if (splits.length != 2) {
            throw new TestConfigurationException(String.format(
                    "Parameter 'unparsedString(\"%s\")' cannot be split into exactly two strings on token '='.",
                    expectationText));
        }
        List<MutableToken> untypedTokens = parseBracketValues(splits[0]);
        ChunkSpec chunkSpec = new ChunkSpec(id, splits[0], parseChunkSpec(splits[1], splits[0], untypedTokens));
        return chunkSpec;
    }

    protected ChunkSpec createDefaultChunkSpec(String id, String delimitedTokenText) {
        return new ChunkSpec(id, delimitedTokenText, Arrays
                .asList(new Chunk[] { new Chunk(ChunkType.ORIGINAL_INPUT, removeBrackets(delimitedTokenText)) }));
    }

    protected String removeBrackets(String delimitedTokenText) {
        return delimitedTokenText.replaceAll("[\\[\\]]", "");
    }

    protected List<MutableToken> parseBracketValues(String delimitedString) {
        List<MutableToken> result = new ArrayList<>();
        Matcher matcher = TOKEN_VALUE_PATTERN.matcher(delimitedString);
        int previousBracketOffset = 0;
        while (matcher.find()) {
            String value = matcher.group(1);
            int start = matcher.start(1) - previousBracketOffset - 1; // -1 for opening "[" of this group
            int end = start + value.length();
            result.add(new MutableToken(value, start, end));
            previousBracketOffset = previousBracketOffset + 2;

        }
        return result;
    }

    /**
     * Parse List of Chunks from pipe-delimited chunk definitions 
     * @param delimitedChunkDefinitions example: 'COUNTY(0-24:BOROUGH_NAME)|ADDRESS(0-19:HOUSE_NUMBER,STREET_NAME)'
     * @return list of zero or more Chunks 
     */
    protected List<Chunk> parseChunkSpec(String delimitedChunkDefinitions, String delimitedTokenText,
            List<MutableToken> untypedTokens) {
        List<Chunk> chunks = new ArrayList<>();
        String[] chunkDefinitions = delimitedChunkDefinitions.split("\\|");
        for (int i = 0; i < chunkDefinitions.length; i++) {
            chunks.add(parseChunk(chunkDefinitions[i], delimitedTokenText, untypedTokens));
        }
        return chunks;
    }

    /**
     * Parse ChunkType from single chunk definition
     * @param chunkDefinition example: 'ADDRESS(0-19:HOUSE_NUMBER{1},STREET_NAME{2})'
     * @return
     */
    protected ChunkType parseChunkType(String chunkDefinition) {
        if (StringUtils.isEmpty(chunkDefinition)) {
            throw new TestConfigurationException("Parameter 'unparsedChunkSpec' cannot be null or empty.");
        }
        // Example: 'COUNTY(0-24,BOROUGH_NAME{3})'
        String chunkTypeString = chunkDefinition.substring(0, chunkDefinition.indexOf("("));
        ChunkType[] actualTypes = ChunkType.values();
        for (int i = 0; i < actualTypes.length; i++) {
            ChunkType chunkType = actualTypes[i];
            if (chunkType.toString().startsWith(chunkTypeString)) {
                return chunkType;
            }
        }
        throw new TestConfigurationException(String.format("Unknown ChunkType.%s specified.", chunkTypeString));
    }

    /**
     * Given the spec body:
     *  '[59] [Maiden Lane] [Manhattan]=COUNTY(0-24:BOROUGH_NAME{3})|ADDRESS(0-19:HOUSE_NUMBER{1},STREET_NAME{2})'
     * Expects the following parameter tokenType format
     * @param chunkDefinition example: 'ADDRESS(0-19:HOUSE_NUMBER{1},STREET_NAME{2})'
     * @param delimitedTokenText example: '[59] [Maiden Lane] [Manhattan]'
     * @return
     */
    protected Chunk parseChunk(String chunkDefinition, String delimitedTokenText,
            List<MutableToken> untypedTokens) {
        if (StringUtils.isEmpty(chunkDefinition)) {
            throw new TestConfigurationException("Parameter 'unparsedChunkSpec' cannot be null or empty.");
        }
        if (StringUtils.isEmpty(delimitedTokenText)) {
            throw new TestConfigurationException("Parameter 'delimitedTokenText' cannot be null or empty.");
        }
        Matcher matcher = CHUNK_DEFINITION_BODY_PATTERN.matcher(chunkDefinition);
        if (!matcher.matches()) {
            throw new TestConfigurationException(
                    String.format("Could not parse chunk definition from input '%s' using pattern '%s'",
                            chunkDefinition, CHUNK_DEFINITION_BODY_PATTERN.pattern()));
        }
        String chunkDefinitionBody = matcher.group(1);
        if (chunkDefinitionBody == null) {
            throw new TestConfigurationException(
                    String.format("Did not find chunk definition body in input '%s'", chunkDefinition));
        }
        // Must exist:  splits[0] == '0-24'
        // Might exist: splits[1] == 'HOUSE_NUMBER,STREET_NAME'
        String[] splits = chunkDefinitionBody.split(":");
        int splitsLength = splits.length;
        if (splitsLength < 1 || splitsLength > 2) {
            throw new TestConfigurationException(String.format(
                    "Expected format '2-12:TYPE1{2},TYPE2{1}' for chunk definition body but found '%s'.",
                    chunkDefinitionBody));
        }
        LOGGER.debug("Parsing Chunk definition '{}'", chunkDefinition);
        ChunkType chunkType = parseChunkType(chunkDefinition);
        String plainText = removeBrackets(delimitedTokenText);
        Range chunkRange = parseChunkRange(splits[0]);
        Chunk chunk = new Chunk(chunkType, chunkRange.substring(plainText));
        if (splitsLength == 1) {
            // Chunk definition does not contain tokens
            return chunk;
        }
        String[] tokenTypeNameAndPositions = splits[1].split(",");
        for (int i = 0; i < tokenTypeNameAndPositions.length; i++) {
            TokenTypeOccurrence typeOccurrence = parseTokenTypeOccurrence(tokenTypeNameAndPositions[i]);
            if (typeOccurrence.occurrence() > untypedTokens.size()) {
                throw new TestConfigurationException(
                        String.format("TokenType specified for value %d but there are only %d Token values.",
                                typeOccurrence.occurrence(), untypedTokens.size()));
            }
            MutableToken untypedToken = untypedTokens.get(typeOccurrence.zeroIndexOccurrence());
            untypedToken.setTokenType(typeOccurrence.tokenType());
            chunk.add(untypedToken.toToken());
        }
        return chunk;
    }

    protected TokenTypeOccurrence parseTokenTypeOccurrence(String tokenTypeNameAndPosition) {
        Matcher matcher = TOKEN_TYPE_OCCURRENCE_PATTERN.matcher(tokenTypeNameAndPosition);
        if (!matcher.matches()) {
            throw new TestConfigurationException(
                    String.format("Could not parse TokenTypeOccurrence from input '%s'", tokenTypeNameAndPosition));
        }
        return new TokenTypeOccurrence(TokenType.valueOf(matcher.group(1)), Integer.valueOf(matcher.group(2)));
    }

    protected ChunkSpecParser.Range parseChunkRange(String string) {
        if (string == null) {
            throw new TestConfigurationException("Chunk range string cannot be null or empty");
        }
        String[] splits = string.split("-");
        if (splits.length != 2) {
            throw new TestConfigurationException(
                    String.format("Expected format '2-12' for chunk range but was '%s'.", string));
        }
        return new Range(Integer.parseInt(splits[0]), Integer.parseInt(splits[1]));
    }

    public static class MutableToken {
        private TokenType tokenType;
        private String value;
        private int start;
        private int end;

        public MutableToken(String value, int start, int end) {
            super();
            this.value = value;
            this.start = start;
            this.end = end;
        }

        public TokenType getTokenType() {
            return tokenType;
        }

        public void setTokenType(TokenType tokenType) {
            this.tokenType = tokenType;
        }

        public String getValue() {
            return value;
        }

        public void setValue(String value) {
            this.value = value;
        }

        public int getStart() {
            return start;
        }

        public void setStart(int start) {
            this.start = start;
        }

        public int getEnd() {
            return end;
        }

        public void setEnd(int end) {
            this.end = end;
        }

        public Token toToken() {
            return new Token(tokenType, value, start, end);
        }

    }

    public static class Range {
        private final int start;
        private final int end;

        public Range(int start, int end) {
            super();
            this.start = start;
            this.end = end;
        }

        public int getStart() {
            return start;
        }

        public int getEnd() {
            return end;
        }

        public String substring(String string) {
            if (string.length() < start || string.length() < end) {
                throw new TestConfigurationException(String.format(
                        "substring(%d,%d) specifies invalid index(s) for \"%s\" which is only %d characters long",
                        start, end, string, string.length()));
            }
            return string.substring(start, end);
        }
    }

    public static class TokenTypeOccurrence {
        private final TokenType tokenType;
        private final int occurrence;

        public TokenTypeOccurrence(TokenType tokenType, int occurrence) {
            super();
            this.tokenType = tokenType;
            this.occurrence = occurrence;
        }

        public TokenType tokenType() {
            return tokenType;
        }

        public int occurrence() {
            return occurrence;
        }

        public int zeroIndexOccurrence() {
            return occurrence - 1;
        }

    }
}