Usage examples for java.util.regex.Pattern#split(CharSequence), collected from open-source projects.
Method signature: public String[] split(CharSequence input)
From source file:org.apache.ctakes.ytex.uima.annotators.NegexAnnotator.java
/**
 * NegEx-style negation detection for one identified annotation within a sentence.
 * The sentence is tagged in place as a string: the target phrase is wrapped in
 * [PHRASE]...[PHRASE], every matching negation rule is wrapped in its tag
 * (e.g. [PREN], [POST]), and then forward/backward scans convert [PHRASE] to
 * [NEGATED] (or [POSSIBLE]) depending on which trigger tags govern it.
 * Precedence: PREN over POST over PREP over POSP.
 *
 * @param aJCas   the CAS (NOTE(review): unused in this method -- result is only printed)
 * @param s       sentence containing the annotation
 * @param ne      the annotation whose negation status is being decided
 * @param negPoss if true, POSSIBLE negation triggers ([PREP]/[POSP]) are also applied
 */
private void checkNegation2(JCas aJCas, Sentence s, IdentifiedAnnotation ne, boolean negPoss) {
    String sToReturn = "";
    String sScope = "";
    // Placeholder substituted for spaces inside tagged spans so whitespace
    // splitting below keeps each tagged span as a single token.
    // NOTE(review): an input sentence that itself contains "_" will be
    // corrupted by the replaceAll(filler, " ") near the end -- confirm.
    String filler = "_";
    // Rules are assumed pre-sorted longest-first so longer legitimate negation
    // rules match before shorter overlapping (PSEUDO/POSSIBLE) ones.
    // Pad one character on each side so boundary-anchored rules can match;
    // the padding is stripped again at the end.
    String sentence = "." + s.getCoveredText() + ".";
    // Tag the phrase we want to decide negation for; must happen before rule tagging.
    String phrase = ne.getCoveredText();
    // NOTE(review): the covered text is compiled as a raw regex; metacharacters
    // such as "(" or "+" in the phrase would throw PatternSyntaxException or
    // mis-match. Pattern.quote(...) would be safer -- verify before changing.
    Pattern pph = Pattern.compile(phrase.trim(), Pattern.CASE_INSENSITIVE);
    Matcher mph = pph.matcher(sentence);
    CharBuffer buf = CharBuffer.wrap(sentence.toCharArray()); // NOTE(review): never read afterwards
    while (mph.find() == true) {
        // replaceAll rewrites every occurrence in one call and leaves the
        // matcher exhausted, so this loop body effectively runs once.
        sentence = mph.replaceAll(" [PHRASE]" + mph.group().trim().replaceAll(" ", filler) + "[PHRASE]");
    }
    // Wrap every negation-rule hit in that rule's tag, spaces replaced by filler.
    for (NegexRule rule : this.listNegexRules) {
        Matcher m = rule.getPattern().matcher(sentence);
        while (m.find() == true) {
            sentence = m.replaceAll(
                    " " + rule.getTag() + m.group().trim().replaceAll(" ", filler) + rule.getTag() + " ");
        }
    }
    // NOTE(review): "[\\s+]" is a character class meaning "whitespace OR '+'",
    // not "one or more whitespace chars"; presumably "\\s+" was intended.
    // Splitting on single whitespace still works because tags are space-separated.
    Pattern pSpace = Pattern.compile("[\\s+]");
    String[] sentenceTokens = pSpace.split(sentence);
    StringBuilder sb = new StringBuilder();
    // Forward scan: a [PREN] trigger negates following [PHRASE] tokens until a
    // terminating control tag ([CONJ]/[PSEU]/[POST]/[PREP]/[POSP]) is reached.
    for (int i = 0; i < sentenceTokens.length; i++) {
        sb.append(" " + sentenceTokens[i].trim());
        if (sentenceTokens[i].trim().startsWith("[PREN]")) {
            for (int j = i + 1; j < sentenceTokens.length; j++) {
                if (sentenceTokens[j].trim().startsWith("[CONJ]") || sentenceTokens[j].trim().startsWith("[PSEU]")
                        || sentenceTokens[j].trim().startsWith("[POST]")
                        || sentenceTokens[j].trim().startsWith("[PREP]")
                        || sentenceTokens[j].trim().startsWith("[POSP]")) {
                    break;
                }
                if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                    sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[NEGATED]");
                }
            }
        }
    }
    sentence = sb.toString();
    pSpace = Pattern.compile("[\\s+]");
    sentenceTokens = pSpace.split(sentence);
    StringBuilder sb2 = new StringBuilder();
    // Backward scan: a [POST] trigger negates preceding [PHRASE] tokens.
    // NOTE(review): both backward scans run with i > 0 / j > 0, so token 0 is
    // neither re-appended to the sentence nor eligible for retagging -- confirm
    // this is intended (token 0 is normally the padding period).
    for (int i = sentenceTokens.length - 1; i > 0; i--) {
        sb2.insert(0, sentenceTokens[i] + " ");
        if (sentenceTokens[i].trim().startsWith("[POST]")) {
            for (int j = i - 1; j > 0; j--) {
                if (sentenceTokens[j].trim().startsWith("[CONJ]") || sentenceTokens[j].trim().startsWith("[PSEU]")
                        || sentenceTokens[j].trim().startsWith("[PREN]")
                        || sentenceTokens[j].trim().startsWith("[PREP]")
                        || sentenceTokens[j].trim().startsWith("[POSP]")) {
                    break;
                }
                if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                    sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[NEGATED]");
                }
            }
        }
    }
    sentence = sb2.toString();
    // When POSSIBLE negation counts as negation, repeat both scans for
    // [PREP]/[POSP], marking phrases [POSSIBLE] instead of [NEGATED].
    if (negPoss == true) {
        pSpace = Pattern.compile("[\\s+]");
        sentenceTokens = pSpace.split(sentence);
        StringBuilder sb3 = new StringBuilder();
        // Forward scan for [PREP].
        for (int i = 0; i < sentenceTokens.length; i++) {
            sb3.append(" " + sentenceTokens[i].trim());
            if (sentenceTokens[i].trim().startsWith("[PREP]")) {
                for (int j = i + 1; j < sentenceTokens.length; j++) {
                    if (sentenceTokens[j].trim().startsWith("[CONJ]")
                            || sentenceTokens[j].trim().startsWith("[PSEU]")
                            || sentenceTokens[j].trim().startsWith("[POST]")
                            || sentenceTokens[j].trim().startsWith("[PREN]")
                            || sentenceTokens[j].trim().startsWith("[POSP]")) {
                        break;
                    }
                    if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                        sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[POSSIBLE]");
                    }
                }
            }
        }
        sentence = sb3.toString();
        pSpace = Pattern.compile("[\\s+]");
        sentenceTokens = pSpace.split(sentence);
        StringBuilder sb4 = new StringBuilder();
        // Backward scan for [POSP].
        for (int i = sentenceTokens.length - 1; i > 0; i--) {
            sb4.insert(0, sentenceTokens[i] + " ");
            if (sentenceTokens[i].trim().startsWith("[POSP]")) {
                for (int j = i - 1; j > 0; j--) {
                    if (sentenceTokens[j].trim().startsWith("[CONJ]")
                            || sentenceTokens[j].trim().startsWith("[PSEU]")
                            || sentenceTokens[j].trim().startsWith("[PREN]")
                            || sentenceTokens[j].trim().startsWith("[PREP]")
                            || sentenceTokens[j].trim().startsWith("[POST]")) {
                        break;
                    }
                    if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                        sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[POSSIBLE]");
                    }
                }
            }
        }
        sentence = sb4.toString();
    }
    // Undo the filler substitution (see NOTE above about literal "_" input).
    sentence = sentence.replaceAll(filler, " ");
    // Strip the padding periods added at the start of the method.
    // NOTE(review): replaceFirst(".", "") uses "." as a regex (any character);
    // it happens to drop the leading character, which is the intended period.
    sentence = sentence.substring(0, sentence.trim().lastIndexOf('.'));
    sentence = sentence.replaceFirst(".", "");
    // Scope of a pre-negation ([PREN]/[PREP]): from the trigger tag up to the
    // next terminating tag, or to the end of the sentence.
    if (sentence.contains("[PREN]") || sentence.contains("[PREP]")) {
        int startOffset = sentence.indexOf("[PREN]");
        if (startOffset == -1) {
            startOffset = sentence.indexOf("[PREP]");
        }
        int endOffset = sentence.indexOf("[CONJ]");
        if (endOffset == -1) {
            endOffset = sentence.indexOf("[PSEU]");
        }
        if (endOffset == -1) {
            endOffset = sentence.indexOf("[POST]");
        }
        if (endOffset == -1) {
            endOffset = sentence.indexOf("[POSP]");
        }
        if (endOffset == -1 || endOffset < startOffset) {
            endOffset = sentence.length() - 1;
        }
        sScope = sentence.substring(startOffset, endOffset + 1);
    }
    // Scope of a post-negation ([POST]/[POSP]): from the last terminating tag
    // (or sentence start) up to the trigger tag. Overwrites any PREN/PREP scope.
    if (sentence.contains("[POST]") || sentence.contains("[POSP]")) {
        int endOffset = sentence.lastIndexOf("[POST]");
        if (endOffset == -1) {
            endOffset = sentence.lastIndexOf("[POSP]");
        }
        int startOffset = sentence.lastIndexOf("[CONJ]");
        if (startOffset == -1) {
            startOffset = sentence.lastIndexOf("[PSEU]");
        }
        if (startOffset == -1) {
            startOffset = sentence.lastIndexOf("[PREN]");
        }
        if (startOffset == -1) {
            startOffset = sentence.lastIndexOf("[PREP]");
        }
        if (startOffset == -1) {
            startOffset = 0;
        }
        sScope = sentence.substring(startOffset, endOffset);
    }
    // Classify as negated / possible / affirmed and append the detected scope.
    if (sentence.contains("[NEGATED]")) {
        sentence = sentence + "\t" + "negated" + "\t" + sScope;
    } else if (sentence.contains("[POSSIBLE]")) {
        sentence = sentence + "\t" + "possible" + "\t" + sScope;
    } else {
        sentence = sentence + "\t" + "affirmed" + "\t" + sScope;
    }
    sToReturn = sentence;
    // NOTE(review): the classification is only printed, never stored on the
    // annotation -- presumably debug output; verify against the caller.
    System.out.println(sToReturn);
}
From source file:org.soas.solr.update.processor.WhereDifferentUPF.java
/**
 * Builds an UpdateRequestProcessor that, for each tag listed in the document's
 * tag field, compares whitespace-separated POS tokens against a compare field
 * and records the positions where the POS tag is absent from the candidate
 * tag list ("[a][b]..." after the '|' in each compare token).
 *
 * For each tag {@code next}: reads {@code posFieldName_next} and
 * {@code compareFieldName_next}; when both have the same token count (and the
 * compare value is genuinely multi-token), writes a diff summary into
 * {@code differentFieldName_next} and {@code changeFieldName_next}.
 */
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    return new UpdateRequestProcessor(next) {
        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {
            final SolrInputDocument doc = cmd.getSolrInputDocument();
            Collection c = doc.getFieldValues(tagFieldName);
            if (c != null) {
                Iterator it = c.iterator();
                while (it.hasNext()) {
                    // Each value of the tag field selects a pair of suffixed fields.
                    String next = (String) it.next();
                    if (doc.containsKey(posFieldName + "_" + next)
                            && doc.containsKey(compareFieldName + "_" + next)) {
                        String posFieldValue = (String) doc.getFieldValue(posFieldName + "_" + next);
                        String[] pos = posFieldValue.split("\\s+");
                        String compareFieldValue = (String) doc.getFieldValue(compareFieldName + "_" + next);
                        String[] compare = compareFieldValue.split("\\s+");
                        // Only diff when token counts line up and the compare value
                        // is not a single token identical to the whole field.
                        if (compare.length == pos.length
                                && !(compare.length == 1 && compare[0].equals(compareFieldValue))) {
                            // Candidate tags are packed as "...|[t1][t2]..."; "][" separates them.
                            Pattern splitter = Pattern.compile("\\]\\[");
                            StringBuffer sbDiff = new StringBuffer();
                            StringBuffer sbChange = new StringBuffer();
                            for (int i = 0; i < compare.length; i++) {
                                sbDiff.append(pos[i]);
                                // Candidate tag list follows the '|' separator.
                                String tags = compare[i].substring(compare[i].indexOf('|') + 1);
                                if (tags.charAt(0) == '[') {
                                    tags = tags.substring(1, tags.length() - 1); // strip [ and ]
                                }
                                String[] tagList = splitter.split(tags);
                                // Reference POS tag also follows '|' in the pos token.
                                String posRef = pos[i].substring(pos[i].indexOf('|') + 1);
                                boolean match = false;
                                for (int k = 0; k < tagList.length; k++) {
                                    if (tagList[k].equals(posRef)) {
                                        match = true;
                                        break;
                                    }
                                }
                                // Reference tag missing from candidates: record all candidates.
                                if (!match) {
                                    sbDiff.append(diffDelim);
                                    sbDiff.append(StringUtils.join(tagList, "~"));
                                }
                                sbDiff.append(' ');
                                // NOTE(review): only spaces are ever appended to sbChange --
                                // the code that populated it was commented out upstream, so
                                // the change field is written as all-blanks. Confirm intended.
                                sbChange.append(' ');
                            }
                            sbDiff.deleteCharAt(sbDiff.length() - 1); // remove final space
                            sbChange.deleteCharAt(sbChange.length() - 1); // remove final space
                            if (differentFieldName != null) {
                                SolrInputField differentField = new SolrInputField(
                                        differentFieldName + "_" + next);
                                differentField.setValue(sbDiff.toString(), 1.0f);
                                doc.put(differentFieldName + "_" + next, differentField);
                            }
                            if (changeFieldName != null) {
                                SolrInputField changeField = new SolrInputField(changeFieldName + "_" + next);
                                changeField.setValue(sbChange.toString(), 1.0f);
                                doc.put(changeFieldName + "_" + next, changeField);
                            }
                        }
                    }
                }
            }
            super.processAdd(cmd);
        }
    };
}
From source file:net.duckling.ddl.service.export.impl.ExportServiceImpl.java
private String processFileOrPageLink(String html, String type, VWBContext context, String path, ArchiveOutputStream out, Map<String, String> id2Title, List<String> allPages, boolean isEpub) { String regex;/*from w w w . j a v a 2 s . c o m*/ if (LynxConstants.TYPE_FILE.equals(type)) { regex = "/file/([0-9]+)"; } else { regex = "/page/([0-9]+)"; } Pattern p = Pattern.compile(regex); String[] cells = p.split(html); if (cells.length == 1) { return html; } for (int i = 0; i < cells.length; i++) { int indexHref = cells[i].lastIndexOf("href=\""); if (i < cells.length - 1 && indexHref > 0) { cells[i] = cells[i].substring(0, indexHref); } int indexQuote = cells[i].indexOf('"'); if (i > 0) { cells[i] = cells[i].substring(indexQuote + 1); } } Matcher m = p.matcher(html); StringBuilder sb = new StringBuilder(); sb.append(cells[0]); int index = 1; while (m.find()) { int attId = Integer.parseInt(m.group(1)); String resKey = attId + "_" + VWBContext.getCurrentTid() + "_" + type; String tagname = path.substring(0, path.lastIndexOf("/")); String resPath = getRelativeResPath(resKey, tagname); if (null == resPath) {// ??? if (regex.contains("file")) { writeAttFile(path, VWBContext.getCurrentTid(), attId, context, out); } else { Resource res = resourceService.getResource(attId, context.getTid()); if (null != res) { List<FileVersion> attFiles = fileVersionService.getFilesOfPage(res.getRid(), VWBContext.getCurrentTid()); for (FileVersion file : attFiles) { writeAttFile(path, VWBContext.getCurrentTid(), file.getRid(), context, out); } writePage(path, res.getRid(), context, out, id2Title, allPages, isEpub); } } resPath = getResNoTagPath(resKey); resPath = (null == resPath) ? "#" : resPath; } sb.append("href=\"" + resPath + "\""); sb.append(cells[index++]); } return sb.toString(); }
From source file:org.codehaus.mojo.VeraxxMojo.java
/**
 * Creates the stderr-capture stream for the vera++ process: opens the XML
 * report file, writes the checkstyle header, and returns an OutputStream that
 * parses each vera++ diagnostic line ("path:line: (RULE) message") and emits a
 * checkstyle &lt;error&gt; element. flush() must be called once at end-of-stream to
 * close the last &lt;file&gt; element and the document.
 *
 * @return a line-buffering filter stream, or System.err if the report file
 *         cannot be created
 */
protected OutputStream getOutputStreamErr() {
    String OutputReportName = new String();
    if (reportsfileDir.isAbsolute()) {
        OutputReportName = reportsfileDir.getAbsolutePath() + "/" + getReportFileName();
    } else {
        OutputReportName = basedir.getAbsolutePath() + "/" + reportsfileDir.getPath() + "/"
                + getReportFileName();
    }
    getLog().info("Vera++ report location " + OutputReportName);
    OutputStream output = System.err;
    File file = new File(OutputReportName);
    try {
        new File(file.getParent()).mkdirs();
        file.createNewFile();
        output = new FileOutputStream(file);
    } catch (IOException e) {
        // Degrade gracefully: report goes to stderr instead of the file.
        getLog().error("Vera++ report redirected to stderr since " + OutputReportName + " can't be opened");
        return output;
    }
    final DataOutputStream out = new DataOutputStream(output);
    try {
        out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        out.writeBytes("<checkstyle version=\"5.0\">\n");
    } catch (IOException e) {
        getLog().error("Vera++ xml report write failure");
    }
    OutputStream outErrFilter = new OutputStream() {
        // Accumulates the current diagnostic line until a newline arrives.
        StringBuffer sb = new StringBuffer();

        public void write(int b) throws IOException {
            if ((b == '\n') || (b == '\r')) {
                transformCurrentLine();
                // cleanup for next line
                sb.delete(0, sb.length());
            } else {
                sb.append((char) b);
            }
        }

        public void flush() throws IOException {
            // Handle a final line without trailing newline, then close the document.
            transformCurrentLine();
            getLog().debug("Vera++ xml flush() called");
            if (!StringUtils.isEmpty(lastfile)) {
                out.writeBytes("\t</file>\n");
            }
            out.writeBytes("</checkstyle>\n");
            out.flush();
        }

        // File name of the most recently opened <file> element, for grouping.
        String lastfile;

        private void transformCurrentLine() {
            if (sb.length() > 0) {
                // Rewrite ' (RULE) ' into 'RULE:' so the line splits on colons.
                String p = "^(.+) \\((.+)\\) (.+)$";
                Pattern pattern = Pattern.compile(p);
                Matcher matcher = pattern.matcher(sb);
                getLog().debug("match " + sb + " on " + p);
                // Windows drive letters ("C:...") would add a spurious colon;
                // mask the drive colon and restore it after splitting.
                // NOTE(review): sb.charAt(1) throws if the line is a single
                // character -- confirm vera++ never emits such lines.
                boolean bWinPath = false;
                if (sb.charAt(1) == ':') {
                    bWinPath = true;
                    sb.setCharAt(1, '_');
                }
                if (matcher.matches()) {
                    String sLine = matcher.group(1) + matcher.group(2) + ":" + matcher.group(3);
                    getLog().debug("rebuild line = " + sLine);
                    // Extract file:line:rule:message fields.
                    pattern = Pattern.compile(":");
                    String[] items = pattern.split(sLine);
                    String file, line, rule, comment, severity;
                    file = items.length > 0 ? items[0] : "";
                    line = items.length > 1 ? items[1] : "";
                    rule = items.length > 2 ? items[2] : "";
                    comment = items.length > 3 ? items[3] : "";
                    severity = "warning";
                    if (bWinPath) {
                        StringBuilder s = new StringBuilder(file);
                        s.setCharAt(1, ':');
                        file = s.toString();
                    }
                    // Emit the checkstyle XML for this diagnostic.
                    try {
                        // Open a new <file> element whenever the file changes.
                        if (!file.equals(lastfile)) {
                            if (!StringUtils.isEmpty(lastfile)) {
                                out.writeBytes("\t</file>\n");
                            }
                            out.writeBytes("\t<file name=\"" + file + "\">\n");
                            lastfile = file;
                        }
                        // NOTE(review): message/file values are not XML-escaped;
                        // a quote or '<' in a diagnostic would break the report.
                        out.writeBytes("\t\t<error line=\"" + line + "\" severity=\"" + severity
                                + "\" message=\"" + comment + "\" source=\"" + rule + "\"/>\n");
                    } catch (IOException e) {
                        getLog().error("Vera++ xml report write failure");
                    }
                }
            }
        }
    };
    return outErrFilter;
}
From source file:org.apache.maven.plugin.cxx.VeraxxMojo.java
/**
 * Creates the stderr-capture stream for the vera++ process: opens the XML
 * report file, writes the checkstyle header, and returns an OutputStream that
 * parses each vera++ diagnostic line ("path:line: (RULE) message") into a
 * checkstyle &lt;error&gt; element. flush() must be called once at end-of-stream
 * to close the last &lt;file&gt; element and the document.
 *
 * @return a line-buffering filter stream, or System.err if the report file
 *         cannot be created
 */
@Override
protected OutputStream getOutputStreamErr() {
    String outputReportName = new String();
    if (reportsfileDir.isAbsolute()) {
        outputReportName = reportsfileDir.getAbsolutePath() + File.separator + getReportFileName();
    } else {
        outputReportName = basedir.getAbsolutePath() + File.separator + reportsfileDir.getPath()
                + File.separator + getReportFileName();
    }
    getLog().info("Vera++ report location " + outputReportName);
    OutputStream output = System.err;
    File file = new File(outputReportName);
    try {
        new File(file.getParent()).mkdirs();
        file.createNewFile();
        output = new FileOutputStream(file);
    } catch (IOException e) {
        // Degrade gracefully: report goes to stderr instead of the file.
        getLog().error("Vera++ report redirected to stderr since " + outputReportName + " can't be opened");
        return output;
    }
    final DataOutputStream out = new DataOutputStream(output);
    try {
        out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        out.writeBytes("<checkstyle version=\"5.0\">\n");
    } catch (IOException e) {
        getLog().error("Vera++ xml report write failure");
    }
    OutputStream outErrFilter = new OutputStream() {
        // Accumulates the current diagnostic line until a newline arrives.
        StringBuffer sb = new StringBuffer();

        public void write(int b) throws IOException {
            if ((b == '\n') || (b == '\r')) {
                transformCurrentLine();
                // cleanup for next line
                sb.delete(0, sb.length());
            } else {
                sb.append((char) b);
            }
        }

        public void flush() throws IOException {
            // Handle a final line without trailing newline, then close the document.
            transformCurrentLine();
            getLog().debug("Vera++ xml flush() called");
            if (!StringUtils.isEmpty(lastfile)) {
                out.writeBytes("\t</file>\n");
            }
            out.writeBytes("</checkstyle>\n");
            out.flush();
        }

        // File name of the most recently opened <file> element, for grouping.
        String lastfile;

        private void transformCurrentLine() {
            if (sb.length() > 0) {
                // Rewrite ' (RULE) ' into 'RULE:' so the line splits on colons.
                String p = "^(.+) \\((.+)\\) (.+)$";
                Pattern pattern = Pattern.compile(p);
                Matcher matcher = pattern.matcher(sb);
                getLog().debug("match " + sb + " on " + p);
                // Windows drive letters ("C:...") would add a spurious colon;
                // mask the drive colon and restore it after splitting.
                // NOTE(review): sb.charAt(1) throws if the line is a single
                // character -- confirm vera++ never emits such lines.
                boolean bWinPath = false;
                if (sb.charAt(1) == ':') {
                    bWinPath = true;
                    sb.setCharAt(1, '_');
                }
                if (matcher.matches()) {
                    String sLine = matcher.group(1) + matcher.group(2) + ":" + matcher.group(3);
                    getLog().debug("rebuild line = " + sLine);
                    // Extract file:line:rule:message fields.
                    pattern = Pattern.compile(":");
                    String[] items = pattern.split(sLine);
                    String file, line, rule, comment, severity;
                    file = items.length > 0 ? items[0] : "";
                    line = items.length > 1 ? items[1] : "";
                    rule = items.length > 2 ? items[2] : "";
                    comment = items.length > 3 ? items[3] : "";
                    severity = "warning";
                    if (bWinPath) {
                        StringBuilder s = new StringBuilder(file);
                        s.setCharAt(1, ':');
                        file = s.toString();
                    }
                    // Emit the checkstyle XML for this diagnostic.
                    try {
                        // Open a new <file> element whenever the file changes.
                        if (!file.equals(lastfile)) {
                            if (!StringUtils.isEmpty(lastfile)) {
                                out.writeBytes("\t</file>\n");
                            }
                            out.writeBytes("\t<file name=\"" + file + "\">\n");
                            lastfile = file;
                        }
                        // NOTE(review): message/file values are not XML-escaped;
                        // a quote or '<' in a diagnostic would break the report.
                        out.writeBytes("\t\t<error line=\"" + line + "\" severity=\"" + severity
                                + "\" message=\"" + comment + "\" source=\"" + rule + "\"/>\n");
                    } catch (IOException e) {
                        getLog().error("Vera++ xml report write failure");
                    }
                }
            }
        }
    };
    return outErrFilter;
}
From source file:com.cyberway.issue.crawler.frontier.AbstractFrontier.java
/**
 * Registers this frontier's configurable settings (politeness delays, retries,
 * bandwidth limits, queue-assignment policy, pause/recovery flags) with the
 * settings framework.
 *
 * @param name Name of this frontier.
 * @param description Description for this frontier.
 */
public AbstractFrontier(String name, String description) {
    super(name, description);
    addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before " + "recontacting same server",
            DEFAULT_DELAY_FACTOR));
    addElementToDefinition(
            new SimpleType(ATTR_MAX_DELAY, "Never wait more than this long.", DEFAULT_MAX_DELAY));
    addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
            "Always wait this long after one completion before recontacting " + "same server.",
            DEFAULT_MIN_DELAY));
    addElementToDefinition(new SimpleType(ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS,
            "Respect a Crawl-Delay directive in a site's robots.txt " + "up to this value in seconds. (If longer, simply "
                    + "respect this value.) Default is 300 seconds (5 minutes).",
            DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS));
    addElementToDefinition(
            new SimpleType(ATTR_MAX_RETRIES, "How often to retry fetching a URI that failed to be retrieved. "
                    + "If zero, the crawler will get the robots.txt only.", DEFAULT_MAX_RETRIES));
    addElementToDefinition(
            new SimpleType(ATTR_RETRY_DELAY, "How long to wait by default until we retry fetching a"
                    + " URI that failed to be retrieved (seconds). ", DEFAULT_RETRY_DELAY));
    addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which " + "a URI has higher priority scheduling. For example, if set "
                    + "to 1 (the default), items such as inline images (1-hop "
                    + "embedded resources) will be scheduled ahead of all regular "
                    + "links (or many-hop resources, like nested frames). If set to "
                    + "zero, no preferencing will occur, and embeds/redirects are "
                    + "scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
    Type t;
    // NOTE(review): several operator-visible strings below contain typos
    // ("to high", "you you") that are preserved here because they are runtime
    // text, not comments.
    t = addElementToDefinition(new SimpleType(ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use. "
                    + "The actual read speed is not affected by this setting, it only "
                    + "holds back new URIs from being processed when the bandwidth "
                    + "usage has been to high. 0 means no bandwidth limitation.",
            DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
    t.setOverrideable(false);
    t = addElementToDefinition(new SimpleType(ATTR_MAX_HOST_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use per "
                    + "host. The actual read speed is not affected by this setting, "
                    + "it only holds back new URIs from being processed when the "
                    + "bandwidth usage has been to high. 0 means no bandwidth " + "limitation.",
            DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
    t.setExpertSetting(true);
    // Read the list of permissible queue-assignment policies from
    // heritrix.properties: a space- or comma-separated class-name list, with a
    // built-in default covering the standard policies.
    String queueStr = System.getProperty(AbstractFrontier.class.getName() + "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
            HostnameQueueAssignmentPolicy.class.getName() + " " + IPQueueAssignmentPolicy.class.getName() + " "
                    + BucketQueueAssignmentPolicy.class.getName() + " "
                    + SurtAuthorityQueueAssignmentPolicy.class.getName() + " "
                    + TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
    Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
    String[] queues = p.split(queueStr);
    // NOTE(review): split never returns an empty array for a non-null input,
    // so this guard is effectively dead -- confirm before removing.
    if (queues.length <= 0) {
        throw new RuntimeException("Failed parse of " + " assignment queue policy string: " + queueStr);
    }
    // First listed policy becomes the default choice.
    t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
            "Defines how to assign URIs to queues. Can assign by host, "
                    + "by ip, and into one of a fixed set of buckets (1k).",
            queues[0], queues));
    t.setExpertSetting(true);
    t.setOverrideable(true);
    t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should " + "be left blank at global level. Specify a "
                    + "per-domain/per-host override to force URIs into "
                    + "a particular named queue, regardless of the assignment "
                    + "policy in effect (domain or ip-based politeness). "
                    + "This could be used on domains known to all be from "
                    + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                    + "to simulate IP-based politeness, or it could be used if "
                    + "you wanted to enforce politeness over a whole domain, even "
                    + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
    t.setOverrideable(true);
    t.setExpertSetting(true);
    t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE, Level.WARNING,
            "This field must contain only alphanumeric "
                    + "characters plus period, dash, comma, colon, or underscore."));
    t = addElementToDefinition(new SimpleType(ATTR_PAUSE_AT_START,
            "Whether to pause when the crawl begins, before any URIs "
                    + "are tried. This gives the operator a chance to verify or "
                    + "adjust the crawl before actual work begins. " + "Default is false.",
            DEFAULT_PAUSE_AT_START));
    t = addElementToDefinition(new SimpleType(ATTR_PAUSE_AT_FINISH,
            "Whether to pause when the crawl appears finished, rather "
                    + "than immediately end the crawl. This gives the operator an "
                    + "opportunity to view crawl results, and possibly add URIs or "
                    + "adjust settings, while the crawl state is still available. " + "Default is false.",
            DEFAULT_PAUSE_AT_FINISH));
    t.setOverrideable(false);
    t = addElementToDefinition(new SimpleType(ATTR_SOURCE_TAG_SEEDS,
            "Whether to tag seeds with their own URI as a heritable "
                    + "'source' String, which will be carried-forward to all URIs "
                    + "discovered on paths originating from that seed. When "
                    + "present, such source tags appear in the second-to-last " + "crawl.log field.",
            DEFAULT_SOURCE_TAG_SEEDS));
    t.setOverrideable(false);
    t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED,
            "Set to false to disable recovery log writing. Do this if "
                    + "you you are using the checkpoint feature for recovering " + "crashed crawls.",
            DEFAULT_ATTR_RECOVERY_ENABLED));
    t.setExpertSetting(true);
    // No sense in it being overrideable.
    t.setOverrideable(false);
}
From source file:com.cyberway.issue.crawler.frontier.AdaptiveRevisitFrontier.java
/**
 * Registers this frontier's configurable settings (politeness delays, retries,
 * host valence, queue-assignment policy, uniq-filter) and the persistent
 * CrawlURI keys used by adaptive revisiting.
 *
 * @param name        frontier name -- NOTE(review): not forwarded; the
 *                    superclass is given the constant Frontier.ATTR_NAME
 *                    instead. Verify this is intentional.
 * @param description frontier description
 */
public AdaptiveRevisitFrontier(String name, String description) {
    super(Frontier.ATTR_NAME, description);
    addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before " + "recontacting same server",
            DEFAULT_DELAY_FACTOR));
    addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
            "Never wait more than this long, regardless of multiple", DEFAULT_MAX_DELAY));
    addElementToDefinition(
            new SimpleType(ATTR_MIN_DELAY, "Always wait this long after one completion before recontacting "
                    + "same server, regardless of multiple", DEFAULT_MIN_DELAY));
    addElementToDefinition(
            new SimpleType(ATTR_MAX_RETRIES, "How often to retry fetching a URI that failed to be retrieved.\n"
                    + "If zero, the crawler will get the robots.txt only.", DEFAULT_MAX_RETRIES));
    addElementToDefinition(
            new SimpleType(ATTR_RETRY_DELAY, "How long to wait by default until we retry fetching a"
                    + " URI that failed to be retrieved (seconds). ", DEFAULT_RETRY_DELAY));
    addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which " + "a URI has higher priority scheduling. For example, if set "
                    + "to 1 (the default), items such as inline images (1-hop "
                    + "embedded resources) will be scheduled ahead of all regular "
                    + "links (or many-hop resources, like nested frames). If set to "
                    + "zero, no preferencing will occur, and embeds/redirects are "
                    + "scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
    Type t;
    t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
            "Maximum number of simultaneous requests to a single" + " host.", DEFAULT_HOST_VALENCE));
    t.setExpertSetting(true);
    t = addElementToDefinition(
            new SimpleType(ATTR_QUEUE_IGNORE_WWW, "If true then documents from x.com, www.x.com and any "
                    + "www[0-9]+.x.com will be assigned to the same queue.", DEFAULT_QUEUE_IGNORE_WWW));
    t.setExpertSetting(true);
    t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should " + "be left blank at global level. Specify a "
                    + "per-domain/per-host override to force URIs into "
                    + "a particular named queue, regardless of the assignment "
                    + "policy in effect (domain or ip-based politeness). "
                    + "This could be used on domains known to all be from "
                    + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                    + "to simulate IP-based politeness, or it could be used if "
                    + "you wanted to enforce politeness over a whole domain, even "
                    + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
    t.setOverrideable(true);
    t.setExpertSetting(true);
    t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE, Level.WARNING,
            "This field must contain only alphanumeric "
                    + "characters plus period, dash, comma, colon, or underscore."));
    t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
            "If true then the Frontier will use a seperate " + "datastructure to detect and eliminate duplicates.\n"
                    + "This is required for Canonicalization rules to work.",
            DEFAULT_USE_URI_UNIQ_FILTER));
    t.setExpertSetting(true);
    t.setOverrideable(false);
    // Read the list of permissible queue-assignment policies from
    // heritrix.properties: a space- or comma-separated class-name list
    // (property is keyed on AbstractFrontier, shared with that class).
    String queueStr = System.getProperty(AbstractFrontier.class.getName() + "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
            HostnameQueueAssignmentPolicy.class.getName() + " " + IPQueueAssignmentPolicy.class.getName() + " "
                    + BucketQueueAssignmentPolicy.class.getName() + " "
                    + SurtAuthorityQueueAssignmentPolicy.class.getName() + " "
                    + TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
    Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
    String[] queues = p.split(queueStr);
    // NOTE(review): split never returns an empty array for a non-null input,
    // so this guard is effectively dead -- confirm before removing.
    if (queues.length <= 0) {
        throw new RuntimeException("Failed parse of " + " assignment queue policy string: " + queueStr);
    }
    t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
            "Defines how to assign URIs to queues. Can assign by host, "
                    + "by ip, and into one of a fixed set of buckets (1k). NOTE: "
                    + "Use of policies other than the default "
                    + "HostnameQueueAssignmentPolicy is untested and provided "
                    + "for use at your own risk. Further, changing this policy "
                    + "during a crawl, or between restarts using the same data "
                    + "directory, is likely to cause unrecoverable problems.",
            DEFAULT_QUEUE_ASSIGNMENT_POLICY, queues));
    t.setExpertSetting(true);
    // Register persistent CrawlURI items
    CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
    CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
}
From source file:edu.cornell.mannlib.vitro.webapp.utils.jena.JenaIngestUtils.java
/** * Splits values for a given data property URI on a supplied regex and * asserts each value using newPropertyURI. New statements returned in * a Jena Model. Split values may be optionally trim()ed. * @param inModel//from w w w . ja v a 2 s.com * @param propertyURI * @param splitRegex * @param newPropertyURI * @param trim * @return outModel */ public Model splitPropertyValues(Model inModel, String propertyURI, String splitRegex, String newPropertyURI, boolean trim) { Model outModel = ModelFactory.createDefaultModel(); Pattern delimiterPattern = Pattern.compile(splitRegex); Property theProp = ResourceFactory.createProperty(propertyURI); Property newProp = ResourceFactory.createProperty(newPropertyURI); inModel.enterCriticalSection(Lock.READ); try { StmtIterator stmtIt = inModel.listStatements((Resource) null, theProp, (RDFNode) null); try { while (stmtIt.hasNext()) { Statement stmt = stmtIt.nextStatement(); Resource subj = stmt.getSubject(); RDFNode obj = stmt.getObject(); if (obj.isLiteral()) { Literal lit = (Literal) obj; String unsplitStr = lit.getLexicalForm(); String[] splitPieces = delimiterPattern.split(unsplitStr); for (int i = 0; i < splitPieces.length; i++) { String newLexicalForm = splitPieces[i]; if (trim) { newLexicalForm = newLexicalForm.trim(); } if (newLexicalForm.length() > 0) { Literal newLiteral = null; if (lit.getDatatype() != null) { newLiteral = outModel.createTypedLiteral(newLexicalForm, lit.getDatatype()); } else { if (lit.getLanguage() != null) { newLiteral = outModel.createLiteral(newLexicalForm, lit.getLanguage()); } else { newLiteral = outModel.createLiteral(newLexicalForm); } } outModel.add(subj, newProp, newLiteral); } } } } } finally { stmtIt.close(); } } finally { inModel.leaveCriticalSection(); } return outModel; }
From source file:com.krawler.formbuilder.servlet.workflowHandler.java
/**
 * Builds the XPDL DOM from serialized diagram state. Each entry in the "data"
 * array is a colon-delimited record ("kind:objId:name:x:y:w:h:parent:...");
 * pool and lane records go into poolContainer, everything else (tasks,
 * start/end events) into taskContainer. Afterwards the XPDL Package root and
 * PackageHeader are created and pools/workflow are appended.
 *
 * NOTE(review): splitting on ":" means any field value containing a colon
 * (e.g. an object name) shifts all later fields -- confirm upstream encoding
 * guarantees colon-free values.
 *
 * @param jsonobj     serialized shapes ({"data": [...]})
 * @param containerId id passed through to addPools
 * @param processId   id passed through to addPools/addWorkflow
 * @param linejson    serialized connector lines ({"data": [...]})
 */
private void writeXml(JSONObject jsonobj, String containerId, String processId, JSONObject linejson)
        throws TransformerConfigurationException, TransformerException, JSONException {
    JSONArray jarr = jsonobj.getJSONArray("data");
    JSONArray linearr = linejson.getJSONArray("data");
    String split = ":";
    for (int i = 0; i < jarr.length(); i++) {
        // NOTE(review): the delimiter pattern is recompiled every iteration;
        // it could be hoisted out of the loop (behavior unchanged).
        Pattern p = Pattern.compile(split);
        JSONObject jobj = jarr.getJSONObject(i);
        String id = jobj.getString("id"); // NOTE(review): read but never used
        String value = jobj.getString("value");
        String[] ObjectVal = null;
        ObjectVal = p.split(value);
        if (ObjectVal[0].equals("process-swim")) {
            // Pool record: kind:objId:name:x:y:w:h:parent:refId
            ObjectInfo obj = new ObjectInfo();
            obj.objId = ObjectVal[1];
            obj.name = ObjectVal[2];
            obj.xpos = ObjectVal[3];
            obj.ypos = ObjectVal[4];
            obj.width = ObjectVal[5];
            obj.height = ObjectVal[6];
            obj.parentId = ObjectVal[7];
            obj.type = "Pool";
            obj.handId = "";
            obj.refId = ObjectVal[8];
            this.poolContainer.add(obj);
        } else if (ObjectVal[0].equals("lane-swim")) {
            // Lane record: kind:objId:name:x:y:w:h:parent:processId:refId
            ObjectInfo obj = new ObjectInfo();
            obj.objId = ObjectVal[1];
            obj.name = ObjectVal[2];
            obj.xpos = ObjectVal[3];
            obj.ypos = ObjectVal[4];
            obj.width = ObjectVal[5];
            obj.height = ObjectVal[6];
            obj.parentId = ObjectVal[7];
            obj.processId = ObjectVal[8];
            obj.refId = ObjectVal[9];
            obj.type = "Lane";
            obj.handId = "";
            this.poolContainer.add(obj);
        } else {
            // Task / start-event / end-event record (16 fields).
            ObjectInfo obj = new ObjectInfo();
            if (ObjectVal[0].equals("task-activity")) {
                obj.type = "task";
            } else if (ObjectVal[0].equals("start")) {
                obj.type = "start";
                obj.handId = ObjectVal[7];
            } else if (ObjectVal[0].equals("end")) {
                obj.type = "end";
                obj.handId = ObjectVal[7];
            }
            obj.objId = ObjectVal[1];
            obj.name = ObjectVal[2];
            obj.xpos = ObjectVal[3];
            obj.ypos = ObjectVal[4];
            obj.width = ObjectVal[5];
            obj.height = ObjectVal[6];
            obj.parentId = ObjectVal[7];
            obj.processId = ObjectVal[8];
            obj.refId = ObjectVal[9];
            obj.hasStart = ObjectVal[10];
            obj.hasEnd = ObjectVal[11];
            obj.startRefId = ObjectVal[12];
            obj.endRefId = ObjectVal[13];
            obj.derivationRule = ObjectVal[14];
            obj.domEl = ObjectVal[15];
            this.taskContainer.add(obj);
        }
    }
    // Build the XPDL skeleton: <Package><PackageHeader><XPDLVersion>2.1
    Element rootElet = dom.createElement("Package");
    rootElet.setAttribute("xmlns", "http://www.wfmc.org/2008/XPDL2.1");
    dom.appendChild(rootElet);
    Element ele = dom.createElement("PackageHeader");
    Element childElement = dom.createElement("XPDLVersion");
    Text text = dom.createTextNode("2.1");
    childElement.appendChild(text);
    ele.appendChild(childElement);
    rootElet.appendChild(ele);
    addPools(rootElet, containerId, processId);
    addWorkflow(rootElet, processId, linearr);
}