Example usage for java.util.regex Pattern split

List of usage examples for java.util.regex Pattern split

Introduction

On this page you can find usage examples for java.util.regex Pattern.split(CharSequence).

Prototype

public String[] split(CharSequence input) 

Document

Splits the given input sequence around matches of this pattern.
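
A minimal, self-contained sketch of the prototype above; the input string and delimiter regex are illustrative:

import java.util.Arrays;
import java.util.regex.Pattern;

public class PatternSplitExample {
    public static void main(String[] args) {
        // Compile the delimiter once and reuse it for many inputs.
        Pattern comma = Pattern.compile("\\s*,\\s*");
        String[] parts = comma.split("alpha, beta ,gamma");
        System.out.println(Arrays.toString(parts)); // [alpha, beta, gamma]
    }
}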

Usage

From source file:org.apache.ctakes.ytex.uima.annotators.NegexAnnotator.java

private void checkNegation2(JCas aJCas, Sentence s, IdentifiedAnnotation ne, boolean negPoss) {
    // Sorter s = new Sorter();
    String sToReturn = "";
    String sScope = "";
    // String sentencePortion = "";
    // ArrayList sortedRules = null;

    String filler = "_";
    // boolean negationScope = true;

    // Sort the rules by length in descending order.
    // Rules need to be sorted so the longest rule is always tried to match
    // first.
    // Some of the rules overlap so without sorting first shorter rules
    // (some of them POSSIBLE or PSEUDO)
    // would match before longer legitimate negation rules.
    //

    // There is efficiency issue here. It is better if rules are sorted by
    // the
    // calling program once and used without sorting in GennegEx.
    // sortedRules = this.rules;

    // Process the sentence and tag each matched negation
    // rule with correct negation rule tag.
    //
    // At the same time check for the phrase that we want to decide
    // the negation status for and
    // tag the phrase with [PHRASE] ... [PHRASE]
    // In both the negation rules and in the phrase replace white space
    // with "filler" string. (This could cause problems if the sentences
    // we study has "filler" on their own.)

    // Sentence needs one character in the beginning and end to match.
    // We remove the extra characters after processing.
    // vng String sentence = "." + sentenceString + ".";
    String sentence = "." + s.getCoveredText() + ".";

    // Tag the phrases we want to detect for negation.
    // Should happen before rule detection.
    // vng String phrase = phraseString;
    String phrase = ne.getCoveredText();
    Pattern pph = Pattern.compile(phrase.trim(), Pattern.CASE_INSENSITIVE);
    Matcher mph = pph.matcher(sentence);
    CharBuffer buf = CharBuffer.wrap(sentence.toCharArray());

    while (mph.find() == true) {
        sentence = mph.replaceAll(" [PHRASE]" + mph.group().trim().replaceAll(" ", filler) + "[PHRASE]");
    }

    for (NegexRule rule : this.listNegexRules) {
        Matcher m = rule.getPattern().matcher(sentence);
        while (m.find() == true) {
            sentence = m.replaceAll(
                    " " + rule.getTag() + m.group().trim().replaceAll(" ", filler) + rule.getTag() + " ");
        }
    }

    // Exchange the [PHRASE] ... [PHRASE] tags for [NEGATED] ... [NEGATED]
    // based of PREN, POST rules and if flag is set to true
    // then based on PREP and POSP, as well.

    // Because PRENEGATION [PREN} is checked first it takes precedent over
    // POSTNEGATION [POST].
    // Similarly POSTNEGATION [POST] takes precedent over POSSIBLE
    // PRENEGATION [PREP]
    // and [PREP] takes precedent over POSSIBLE POSTNEGATION [POSP].

    Pattern pSpace = Pattern.compile("[\\s+]");
    String[] sentenceTokens = pSpace.split(sentence);
    StringBuilder sb = new StringBuilder();

    // Check for [PREN]
    for (int i = 0; i < sentenceTokens.length; i++) {
        sb.append(" " + sentenceTokens[i].trim());
        if (sentenceTokens[i].trim().startsWith("[PREN]")) {

            for (int j = i + 1; j < sentenceTokens.length; j++) {
                if (sentenceTokens[j].trim().startsWith("[CONJ]")
                        || sentenceTokens[j].trim().startsWith("[PSEU]")
                        || sentenceTokens[j].trim().startsWith("[POST]")
                        || sentenceTokens[j].trim().startsWith("[PREP]")
                        || sentenceTokens[j].trim().startsWith("[POSP]")) {
                    break;
                }

                if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                    sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[NEGATED]");
                }
            }
        }
    }

    sentence = sb.toString();
    pSpace = Pattern.compile("[\\s+]");
    sentenceTokens = pSpace.split(sentence);
    StringBuilder sb2 = new StringBuilder();

    // Check for [POST]
    for (int i = sentenceTokens.length - 1; i > 0; i--) {
        sb2.insert(0, sentenceTokens[i] + " ");
        if (sentenceTokens[i].trim().startsWith("[POST]")) {
            for (int j = i - 1; j > 0; j--) {
                if (sentenceTokens[j].trim().startsWith("[CONJ]")
                        || sentenceTokens[j].trim().startsWith("[PSEU]")
                        || sentenceTokens[j].trim().startsWith("[PREN]")
                        || sentenceTokens[j].trim().startsWith("[PREP]")
                        || sentenceTokens[j].trim().startsWith("[POSP]")) {
                    break;
                }

                if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                    sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[NEGATED]");
                }
            }
        }
    }
    sentence = sb2.toString();

    // If POSSIBLE negation is detected as negation.
    // negatePossible being set to "true" then check for [PREP] and [POSP].
    if (negPoss == true) {
        pSpace = Pattern.compile("[\\s+]");
        sentenceTokens = pSpace.split(sentence);

        StringBuilder sb3 = new StringBuilder();

        // Check for [PREP]
        for (int i = 0; i < sentenceTokens.length; i++) {
            sb3.append(" " + sentenceTokens[i].trim());
            if (sentenceTokens[i].trim().startsWith("[PREP]")) {

                for (int j = i + 1; j < sentenceTokens.length; j++) {
                    if (sentenceTokens[j].trim().startsWith("[CONJ]")
                            || sentenceTokens[j].trim().startsWith("[PSEU]")
                            || sentenceTokens[j].trim().startsWith("[POST]")
                            || sentenceTokens[j].trim().startsWith("[PREN]")
                            || sentenceTokens[j].trim().startsWith("[POSP]")) {
                        break;
                    }

                    if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                        sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[POSSIBLE]");
                    }
                }
            }
        }
        sentence = sb3.toString();
        pSpace = Pattern.compile("[\\s+]");
        sentenceTokens = pSpace.split(sentence);
        StringBuilder sb4 = new StringBuilder();

        // Check for [POSP]
        for (int i = sentenceTokens.length - 1; i > 0; i--) {
            sb4.insert(0, sentenceTokens[i] + " ");
            if (sentenceTokens[i].trim().startsWith("[POSP]")) {
                for (int j = i - 1; j > 0; j--) {
                    if (sentenceTokens[j].trim().startsWith("[CONJ]")
                            || sentenceTokens[j].trim().startsWith("[PSEU]")
                            || sentenceTokens[j].trim().startsWith("[PREN]")
                            || sentenceTokens[j].trim().startsWith("[PREP]")
                            || sentenceTokens[j].trim().startsWith("[POST]")) {
                        break;
                    }

                    if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                        sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[POSSIBLE]");
                    }
                }
            }
        }
        sentence = sb4.toString();
    }

    // Remove the filler character we used.
    sentence = sentence.replaceAll(filler, " ");

    // Remove the extra periods at the beginning
    // and end of the sentence.
    sentence = sentence.substring(0, sentence.trim().lastIndexOf('.'));
    sentence = sentence.replaceFirst(".", "");

    // Get the scope of the negation for PREN and PREP
    if (sentence.contains("[PREN]") || sentence.contains("[PREP]")) {
        int startOffset = sentence.indexOf("[PREN]");
        if (startOffset == -1) {
            startOffset = sentence.indexOf("[PREP]");
        }

        int endOffset = sentence.indexOf("[CONJ]");
        if (endOffset == -1) {
            endOffset = sentence.indexOf("[PSEU]");
        }
        if (endOffset == -1) {
            endOffset = sentence.indexOf("[POST]");
        }
        if (endOffset == -1) {
            endOffset = sentence.indexOf("[POSP]");
        }
        if (endOffset == -1 || endOffset < startOffset) {
            endOffset = sentence.length() - 1;
        }
        sScope = sentence.substring(startOffset, endOffset + 1);
    }

    // Get the scope of the negation for POST and POSP
    if (sentence.contains("[POST]") || sentence.contains("[POSP]")) {
        int endOffset = sentence.lastIndexOf("[POST]");
        if (endOffset == -1) {
            endOffset = sentence.lastIndexOf("[POSP]");
        }

        int startOffset = sentence.lastIndexOf("[CONJ]");
        if (startOffset == -1) {
            startOffset = sentence.lastIndexOf("[PSEU]");
        }
        if (startOffset == -1) {
            startOffset = sentence.lastIndexOf("[PREN]");
        }
        if (startOffset == -1) {
            startOffset = sentence.lastIndexOf("[PREP]");
        }
        if (startOffset == -1) {
            startOffset = 0;
        }
        sScope = sentence.substring(startOffset, endOffset);
    }

    // Classify to: negated/possible/affirmed
    if (sentence.contains("[NEGATED]")) {
        sentence = sentence + "\t" + "negated" + "\t" + sScope;
    } else if (sentence.contains("[POSSIBLE]")) {
        sentence = sentence + "\t" + "possible" + "\t" + sScope;
    } else {
        sentence = sentence + "\t" + "affirmed" + "\t" + sScope;
    }

    sToReturn = sentence;
    System.out.println(sToReturn);
}
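
A standalone sketch of the tokenization step above. Note that "[\\s+]" is a character class matching a single whitespace character or a literal '+', so runs of spaces produce empty tokens; "\\s+" (without brackets) would treat each run as one delimiter. The sample sentence is illustrative:

import java.util.Arrays;
import java.util.regex.Pattern;

public class WhitespaceSplitDemo {
    public static void main(String[] args) {
        String sentence = "no  sign of  [PREN]disease";

        // Character class: splits on every single space or '+'.
        Pattern charClass = Pattern.compile("[\\s+]");
        System.out.println(Arrays.toString(charClass.split(sentence)));
        // [no, , sign, of, , [PREN]disease]

        // One-or-more whitespace: collapses each run into one delimiter.
        Pattern oneOrMore = Pattern.compile("\\s+");
        System.out.println(Arrays.toString(oneOrMore.split(sentence)));
        // [no, sign, of, [PREN]disease]
    }
}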

From source file:org.soas.solr.update.processor.WhereDifferentUPF.java

@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    return new UpdateRequestProcessor(next) {
        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {

            final SolrInputDocument doc = cmd.getSolrInputDocument();

            Collection c = doc.getFieldValues(tagFieldName);

            if (c != null) {
                Iterator it = c.iterator();

                while (it.hasNext()) {
                    String next = (String) it.next();

                    if (doc.containsKey(posFieldName + "_" + next)
                            && doc.containsKey(compareFieldName + "_" + next)) {
                        String posFieldValue = (String) doc.getFieldValue(posFieldName + "_" + next);
                        String[] pos = posFieldValue.split("\\s+");
                        String compareFieldValue = (String) doc.getFieldValue(compareFieldName + "_" + next);
                        String[] compare = compareFieldValue.split("\\s+");

                        //log.info("posFieldValue = " + posFieldValue);
                        //log.info("compareFieldValue = " + compareFieldValue);

                        if (compare.length == pos.length
                                && !(compare.length == 1 && compare[0].equals(compareFieldValue))) {
                            //Pattern oneTag = Pattern.compile("\\[?([^\\]]+)\\]?");
                            Pattern splitter = Pattern.compile("\\]\\[");
                            StringBuffer sbDiff = new StringBuffer();
                            StringBuffer sbChange = new StringBuffer();
                            for (int i = 0; i < compare.length; i++) {
                                sbDiff.append(pos[i]);
                                String tags = compare[i].substring(compare[i].indexOf('|') + 1);
                                if (tags.charAt(0) == '[') {
                                    tags = tags.substring(1, tags.length() - 1); //strip [ and ]
                                }

                                //Matcher m = oneTag.matcher(tags);
                                //if (m.matches()) {

                                String[] tagList = splitter.split(tags);
                                String posRef = pos[i].substring(pos[i].indexOf('|') + 1);
                                boolean match = false;
                                for (int k = 0; k < tagList.length; k++) {
                                    //String tag = m.group(1); //tags.substring(1, tags.length()-1);
                                    //if (!tag.equals(pos[i].substring(pos[i].indexOf('|')+1))) {
                                    if (tagList[k].equals(posRef)) {
                                        match = true;
                                        break;
                                    }
                                }

                                if (!match) {
                                    sbDiff.append(diffDelim);
                                    sbDiff.append(StringUtils.join(tagList, "~"));
                                }
                                /*
                                    sbDiff.append(diffDelim);
                                    sbDiff.append(tag);
                                    sbChange.append(pos[i].substring(0, pos[i].indexOf('|')));
                                    sbChange.append(diffDelim);
                                    sbChange.append(tag);
                                }
                                else {
                                    sbChange.append(pos[i]);
                                }
                                }
                                else {
                                sbChange.append(pos[i]);
                                }
                                */
                                sbDiff.append(' ');
                                sbChange.append(' ');
                            }
                            sbDiff.deleteCharAt(sbDiff.length() - 1); //remove final space  
                            sbChange.deleteCharAt(sbChange.length() - 1); //remove final space

                            if (differentFieldName != null) {
                                SolrInputField differentField = new SolrInputField(
                                        differentFieldName + "_" + next);
                                differentField.setValue(sbDiff.toString(), 1.0f);
                                doc.put(differentFieldName + "_" + next, differentField);
                            }

                            if (changeFieldName != null) {
                                SolrInputField changeField = new SolrInputField(changeFieldName + "_" + next);
                                changeField.setValue(sbChange.toString(), 1.0f);
                                doc.put(changeFieldName + "_" + next, changeField);
                            }
                        }
                    }
                }
            }

            super.processAdd(cmd);
        }
    };
}
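
A short sketch of the "\\]\\[" splitter above, using a hypothetical tag payload in the bracketed form that the bracket-stripping logic implies:

import java.util.Arrays;
import java.util.regex.Pattern;

public class TagSplitDemo {
    public static void main(String[] args) {
        String tags = "[NOUN][VERB][ADJ]";
        tags = tags.substring(1, tags.length() - 1); // strip the outer [ and ]

        // Split on the "][" boundaries between adjacent tags.
        String[] tagList = Pattern.compile("\\]\\[").split(tags);
        System.out.println(Arrays.toString(tagList)); // [NOUN, VERB, ADJ]
    }
}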

From source file:net.duckling.ddl.service.export.impl.ExportServiceImpl.java

private String processFileOrPageLink(String html, String type, VWBContext context, String path,
        ArchiveOutputStream out, Map<String, String> id2Title, List<String> allPages, boolean isEpub) {
    String regex;
    if (LynxConstants.TYPE_FILE.equals(type)) {
        regex = "/file/([0-9]+)";
    } else {
        regex = "/page/([0-9]+)";
    }
    Pattern p = Pattern.compile(regex);
    String[] cells = p.split(html);
    if (cells.length == 1) {
        return html;
    }
    for (int i = 0; i < cells.length; i++) {
        int indexHref = cells[i].lastIndexOf("href=\"");
        if (i < cells.length - 1 && indexHref > 0) {
            cells[i] = cells[i].substring(0, indexHref);
        }
        int indexQuote = cells[i].indexOf('"');
        if (i > 0) {
            cells[i] = cells[i].substring(indexQuote + 1);
        }
    }
    Matcher m = p.matcher(html);
    StringBuilder sb = new StringBuilder();
    sb.append(cells[0]);
    int index = 1;
    while (m.find()) {
        int attId = Integer.parseInt(m.group(1));
        String resKey = attId + "_" + VWBContext.getCurrentTid() + "_" + type;
        String tagname = path.substring(0, path.lastIndexOf("/"));
        String resPath = getRelativeResPath(resKey, tagname);
        if (null == resPath) {// ???
            if (regex.contains("file")) {
                writeAttFile(path, VWBContext.getCurrentTid(), attId, context, out);
            } else {
                Resource res = resourceService.getResource(attId, context.getTid());
                if (null != res) {
                    List<FileVersion> attFiles = fileVersionService.getFilesOfPage(res.getRid(),
                            VWBContext.getCurrentTid());
                    for (FileVersion file : attFiles) {
                        writeAttFile(path, VWBContext.getCurrentTid(), file.getRid(), context, out);
                    }
                    writePage(path, res.getRid(), context, out, id2Title, allPages, isEpub);
                }
            }
            resPath = getResNoTagPath(resKey);
            resPath = (null == resPath) ? "#" : resPath;
        }
        sb.append("href=\"" + resPath + "\"");
        sb.append(cells[index++]);
    }
    return sb.toString();
}

From source file:org.codehaus.mojo.VeraxxMojo.java

protected OutputStream getOutputStreamErr() {
    String OutputReportName = new String();
    if (reportsfileDir.isAbsolute()) {
        OutputReportName = reportsfileDir.getAbsolutePath() + "/" + getReportFileName();
    } else {
        OutputReportName = basedir.getAbsolutePath() + "/" + reportsfileDir.getPath() + "/"
                + getReportFileName();
    }
    getLog().info("Vera++ report location " + OutputReportName);

    OutputStream output = System.err;
    File file = new File(OutputReportName);
    try {
        new File(file.getParent()).mkdirs();
        file.createNewFile();
        output = new FileOutputStream(file);
    } catch (IOException e) {
        getLog().error("Vera++ report redirected to stderr since " + OutputReportName + " can't be opened");
        return output;
    }

    final DataOutputStream out = new DataOutputStream(output);

    try {
        out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        out.writeBytes("<checkstyle version=\"5.0\">\n");
    } catch (IOException e) {
        getLog().error("Vera++ xml report write failure");
    }

    OutputStream outErrFilter = new OutputStream() {
        StringBuffer sb = new StringBuffer();

        public void write(int b) throws IOException {
            if ((b == '\n') || (b == '\r')) {
                transformCurrentLine();
                // cleanup for next line
                sb.delete(0, sb.length());
            } else {
                sb.append((char) b);
            }
        }

        public void flush() throws IOException {
            transformCurrentLine();
            getLog().debug("Vera++ xml flush() called");
            if (!StringUtils.isEmpty(lastfile)) {
                out.writeBytes("\t</file>\n");
            }
            out.writeBytes("</checkstyle>\n");
            out.flush();
        }

        String lastfile;

        private void transformCurrentLine() {
            if (sb.length() > 0) {
                // parse current line

                // try to replace ' (RULENumber) ' with 'RULENumber:'
                String p = "^(.+) \\((.+)\\) (.+)$";
                Pattern pattern = Pattern.compile(p);
                Matcher matcher = pattern.matcher(sb);
                getLog().debug("match " + sb + " on " + p);

                boolean bWinPath = false;
                if (sb.charAt(1) == ':') {
                    bWinPath = true;
                    sb.setCharAt(1, '_');
                }

                if (matcher.matches()) {
                    String sLine = matcher.group(1) + matcher.group(2) + ":" + matcher.group(3);
                    getLog().debug("rebuild line = " + sLine);

                    // extract informations
                    pattern = Pattern.compile(":");
                    String[] items = pattern.split(sLine);

                    String file, line, rule, comment, severity;
                    file = items.length > 0 ? items[0] : "";
                    line = items.length > 1 ? items[1] : "";
                    rule = items.length > 2 ? items[2] : "";
                    comment = items.length > 3 ? items[3] : "";
                    severity = "warning";

                    if (bWinPath) {
                        StringBuilder s = new StringBuilder(file);
                        s.setCharAt(1, ':');
                        file = s.toString();
                    }

                    // output Xml errors
                    try {
                        // handle <file/> tags
                        if (!file.equals(lastfile)) {
                            if (!StringUtils.isEmpty(lastfile)) {
                                out.writeBytes("\t</file>\n");
                            }
                            out.writeBytes("\t<file name=\"" + file + "\">\n");
                            lastfile = file;
                        }
                        out.writeBytes("\t\t<error line=\"" + line + "\" severity=\"" + severity
                                + "\" message=\"" + comment + "\" source=\"" + rule + "\"/>\n");
                    } catch (IOException e) {
                        getLog().error("Vera++ xml report write failure");
                    }
                }
            }
        }
    };
    return outErrFilter;
}
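
A condensed sketch of the transformCurrentLine() logic above, assuming a Vera++ diagnostic shaped like "path:line: (RULE) message"; the sample line is hypothetical:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class VeraLineSplitDemo {
    public static void main(String[] args) {
        String raw = "src/main.cpp:42: (T008) keyword 'if' not followed by a single space";
        Matcher matcher = Pattern.compile("^(.+) \\((.+)\\) (.+)$").matcher(raw);
        if (matcher.matches()) {
            // Rebuild as "path:line:RULE:message", then split on ':'.
            String sLine = matcher.group(1) + matcher.group(2) + ":" + matcher.group(3);
            String[] items = Pattern.compile(":").split(sLine);
            System.out.println(items[0]); // src/main.cpp
            System.out.println(items[1]); // 42
            System.out.println(items[2]); // T008
            System.out.println(items[3]); // keyword 'if' not followed by a single space
        }
    }
}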

From source file:org.apache.maven.plugin.cxx.VeraxxMojo.java

@Override
protected OutputStream getOutputStreamErr() {
    String outputReportName = new String();
    if (reportsfileDir.isAbsolute()) {
        outputReportName = reportsfileDir.getAbsolutePath() + File.separator + getReportFileName();
    } else {
        outputReportName = basedir.getAbsolutePath() + File.separator + reportsfileDir.getPath()
                + File.separator + getReportFileName();
    }
    getLog().info("Vera++ report location " + outputReportName);

    OutputStream output = System.err;
    File file = new File(outputReportName);
    try {
        new File(file.getParent()).mkdirs();
        file.createNewFile();
        output = new FileOutputStream(file);
    } catch (IOException e) {
        getLog().error("Vera++ report redirected to stderr since " + outputReportName + " can't be opened");
        return output;
    }

    final DataOutputStream out = new DataOutputStream(output);

    try {
        out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        out.writeBytes("<checkstyle version=\"5.0\">\n");
    } catch (IOException e) {
        getLog().error("Vera++ xml report write failure");
    }

    OutputStream outErrFilter = new OutputStream() {
        StringBuffer sb = new StringBuffer();

        public void write(int b) throws IOException {
            if ((b == '\n') || (b == '\r')) {
                transformCurrentLine();
                // cleanup for next line
                sb.delete(0, sb.length());
            } else {
                sb.append((char) b);
            }
        }

        public void flush() throws IOException {
            transformCurrentLine();
            getLog().debug("Vera++ xml flush() called");
            if (!StringUtils.isEmpty(lastfile)) {
                out.writeBytes("\t</file>\n");
            }
            out.writeBytes("</checkstyle>\n");
            out.flush();
        }

        String lastfile;

        private void transformCurrentLine() {
            if (sb.length() > 0) {
                // parse current line

                // try to replace ' (RULENumber) ' with 'RULENumber:'
                String p = "^(.+) \\((.+)\\) (.+)$";
                Pattern pattern = Pattern.compile(p);
                Matcher matcher = pattern.matcher(sb);
                getLog().debug("match " + sb + " on " + p);

                boolean bWinPath = false;
                if (sb.charAt(1) == ':') {
                    bWinPath = true;
                    sb.setCharAt(1, '_');
                }

                if (matcher.matches()) {
                    String sLine = matcher.group(1) + matcher.group(2) + ":" + matcher.group(3);
                    getLog().debug("rebuild line = " + sLine);

                    // extract informations
                    pattern = Pattern.compile(":");
                    String[] items = pattern.split(sLine);

                    String file, line, rule, comment, severity;
                    file = items.length > 0 ? items[0] : "";
                    line = items.length > 1 ? items[1] : "";
                    rule = items.length > 2 ? items[2] : "";
                    comment = items.length > 3 ? items[3] : "";
                    severity = "warning";

                    if (bWinPath) {
                        StringBuilder s = new StringBuilder(file);
                        s.setCharAt(1, ':');
                        file = s.toString();
                    }

                    // output Xml errors
                    try {
                        // handle <file/> tags
                        if (!file.equals(lastfile)) {
                            if (!StringUtils.isEmpty(lastfile)) {
                                out.writeBytes("\t</file>\n");
                            }
                            out.writeBytes("\t<file name=\"" + file + "\">\n");
                            lastfile = file;
                        }
                        out.writeBytes("\t\t<error line=\"" + line + "\" severity=\"" + severity
                                + "\" message=\"" + comment + "\" source=\"" + rule + "\"/>\n");
                    } catch (IOException e) {
                        getLog().error("Vera++ xml report write failure");
                    }
                }
            }
        }
    };
    return outErrFilter;
}

From source file:com.cyberway.issue.crawler.frontier.AbstractFrontier.java

/**
 * @param name Name of this frontier.
 * @param description Description for this frontier.
 */
public AbstractFrontier(String name, String description) {
    super(name, description);
    addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before " + "recontacting same server",
            DEFAULT_DELAY_FACTOR));
    addElementToDefinition(
            new SimpleType(ATTR_MAX_DELAY, "Never wait more than this long.", DEFAULT_MAX_DELAY));
    addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
            "Always wait this long after one completion before recontacting " + "same server.",
            DEFAULT_MIN_DELAY));
    addElementToDefinition(new SimpleType(ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS,
            "Respect a Crawl-Delay directive in a site's robots.txt "
                    + "up to this value in seconds. (If longer, simply "
                    + "respect this value.) Default is 300 seconds (5 minutes).",
            DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS));
    addElementToDefinition(
            new SimpleType(ATTR_MAX_RETRIES, "How often to retry fetching a URI that failed to be retrieved. "
                    + "If zero, the crawler will get the robots.txt only.", DEFAULT_MAX_RETRIES));
    addElementToDefinition(
            new SimpleType(ATTR_RETRY_DELAY, "How long to wait by default until we retry fetching a"
                    + " URI that failed to be retrieved (seconds). ", DEFAULT_RETRY_DELAY));
    addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which "
                    + "a URI has higher priority scheduling. For example, if set "
                    + "to 1 (the default), items such as inline images (1-hop "
                    + "embedded resources) will be scheduled ahead of all regular "
                    + "links (or many-hop resources, like nested frames). If set to "
                    + "zero, no preferencing will occur, and embeds/redirects are "
                    + "scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
    Type t;
    t = addElementToDefinition(new SimpleType(ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use. "
                    + "The actual read speed is not affected by this setting, it only "
                    + "holds back new URIs from being processed when the bandwidth "
                    + "usage has been to high. 0 means no bandwidth limitation.",
            DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
    t.setOverrideable(false);
    t = addElementToDefinition(new SimpleType(ATTR_MAX_HOST_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use per "
                    + "host. The actual read speed is not affected by this setting, "
                    + "it only holds back new URIs from being processed when the "
                    + "bandwidth usage has been to high. 0 means no bandwidth " + "limitation.",
            DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
    t.setExpertSetting(true);

    // Read the list of permissible choices from heritrix.properties.
    // Its a list of space- or comma-separated values.
    String queueStr = System.getProperty(AbstractFrontier.class.getName() + "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
            HostnameQueueAssignmentPolicy.class.getName() + " " + IPQueueAssignmentPolicy.class.getName() + " "
                    + BucketQueueAssignmentPolicy.class.getName() + " "
                    + SurtAuthorityQueueAssignmentPolicy.class.getName() + " "
                    + TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
    Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
    String[] queues = p.split(queueStr);
    if (queues.length <= 0) {
        throw new RuntimeException("Failed parse of " + " assignment queue policy string: " + queueStr);
    }
    t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
            "Defines how to assign URIs to queues. Can assign by host, "
                    + "by ip, and into one of a fixed set of buckets (1k).",
            queues[0], queues));
    t.setExpertSetting(true);
    t.setOverrideable(true);

    t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should " + "be left blank at global level.  Specify a "
                    + "per-domain/per-host override to force URIs into "
                    + "a particular named queue, regardless of the assignment "
                    + "policy in effect (domain or ip-based politeness). "
                    + "This could be used on domains known to all be from "
                    + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                    + "to simulate IP-based politeness, or it could be used if "
                    + "you wanted to enforce politeness over a whole domain, even "
                    + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
    t.setOverrideable(true);
    t.setExpertSetting(true);
    t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE, Level.WARNING,
            "This field must contain only alphanumeric "
                    + "characters plus period, dash, comma, colon, or underscore."));
    t = addElementToDefinition(new SimpleType(ATTR_PAUSE_AT_START,
            "Whether to pause when the crawl begins, before any URIs "
                    + "are tried. This gives the operator a chance to verify or "
                    + "adjust the crawl before actual work begins. " + "Default is false.",
            DEFAULT_PAUSE_AT_START));
    t = addElementToDefinition(new SimpleType(ATTR_PAUSE_AT_FINISH,
            "Whether to pause when the crawl appears finished, rather "
                    + "than immediately end the crawl. This gives the operator an "
                    + "opportunity to view crawl results, and possibly add URIs or "
                    + "adjust settings, while the crawl state is still available. " + "Default is false.",
            DEFAULT_PAUSE_AT_FINISH));
    t.setOverrideable(false);

    t = addElementToDefinition(new SimpleType(ATTR_SOURCE_TAG_SEEDS,
            "Whether to tag seeds with their own URI as a heritable "
                    + "'source' String, which will be carried-forward to all URIs "
                    + "discovered on paths originating from that seed. When "
                    + "present, such source tags appear in the second-to-last " + "crawl.log field.",
            DEFAULT_SOURCE_TAG_SEEDS));
    t.setOverrideable(false);

    t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED,
            "Set to false to disable recovery log writing.  Do this if "
                    + "you you are using the checkpoint feature for recovering " + "crashed crawls.",
            DEFAULT_ATTR_RECOVERY_ENABLED));
    t.setExpertSetting(true);
    // No sense in it being overrideable.
    t.setOverrideable(false);
}
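
A minimal sketch of the "\\s*,\\s*|\\s+" delimiter used to parse the queue-assignment-policy list above; the shortened class names are illustrative:

import java.util.Arrays;
import java.util.regex.Pattern;

public class PolicyListSplitDemo {
    public static void main(String[] args) {
        // Accepts comma-separated, space-separated, or mixed lists.
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String queueStr = "HostnamePolicy, IPPolicy BucketPolicy";
        System.out.println(Arrays.toString(p.split(queueStr)));
        // [HostnamePolicy, IPPolicy, BucketPolicy]
    }
}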

From source file:com.cyberway.issue.crawler.frontier.AdaptiveRevisitFrontier.java

public AdaptiveRevisitFrontier(String name, String description) {
    super(Frontier.ATTR_NAME, description);
    addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before " + "recontacting same server",
            DEFAULT_DELAY_FACTOR));
    addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
            "Never wait more than this long, regardless of multiple", DEFAULT_MAX_DELAY));
    addElementToDefinition(
            new SimpleType(ATTR_MIN_DELAY, "Always wait this long after one completion before recontacting "
                    + "same server, regardless of multiple", DEFAULT_MIN_DELAY));
    addElementToDefinition(
            new SimpleType(ATTR_MAX_RETRIES, "How often to retry fetching a URI that failed to be retrieved.\n"
                    + "If zero, the crawler will get the robots.txt only.", DEFAULT_MAX_RETRIES));
    addElementToDefinition(
            new SimpleType(ATTR_RETRY_DELAY, "How long to wait by default until we retry fetching a"
                    + " URI that failed to be retrieved (seconds). ", DEFAULT_RETRY_DELAY));
    addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which "
                    + "a URI has higher priority scheduling. For example, if set "
                    + "to 1 (the default), items such as inline images (1-hop "
                    + "embedded resources) will be scheduled ahead of all regular "
                    + "links (or many-hop resources, like nested frames). If set to "
                    + "zero, no preferencing will occur, and embeds/redirects are "
                    + "scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
    Type t;
    t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
            "Maximum number of simultaneous requests to a single" + " host.", DEFAULT_HOST_VALENCE));
    t.setExpertSetting(true);
    t = addElementToDefinition(
            new SimpleType(ATTR_QUEUE_IGNORE_WWW, "If true then documents from x.com, www.x.com and any "
                    + "www[0-9]+.x.com will be assigned to the same queue.", DEFAULT_QUEUE_IGNORE_WWW));
    t.setExpertSetting(true);
    t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should " + "be left blank at global level.  Specify a "
                    + "per-domain/per-host override to force URIs into "
                    + "a particular named queue, regardless of the assignment "
                    + "policy in effect (domain or ip-based politeness). "
                    + "This could be used on domains known to all be from "
                    + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                    + "to simulate IP-based politeness, or it could be used if "
                    + "you wanted to enforce politeness over a whole domain, even "
                    + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
    t.setOverrideable(true);
    t.setExpertSetting(true);
    t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE, Level.WARNING,
            "This field must contain only alphanumeric "
                    + "characters plus period, dash, comma, colon, or underscore."));
    t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
            "If true then the Frontier will use a seperate "
                    + "datastructure to detect and eliminate duplicates.\n"
                    + "This is required for Canonicalization rules to work.",
            DEFAULT_USE_URI_UNIQ_FILTER));
    t.setExpertSetting(true);
    t.setOverrideable(false);
    // Read the list of permissible choices from heritrix.properties.
    // Its a list of space- or comma-separated values.
    String queueStr = System.getProperty(AbstractFrontier.class.getName() + "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
            HostnameQueueAssignmentPolicy.class.getName() + " " + IPQueueAssignmentPolicy.class.getName() + " "
                    + BucketQueueAssignmentPolicy.class.getName() + " "
                    + SurtAuthorityQueueAssignmentPolicy.class.getName() + " "
                    + TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
    Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
    String[] queues = p.split(queueStr);
    if (queues.length <= 0) {
        throw new RuntimeException("Failed parse of " + " assignment queue policy string: " + queueStr);
    }
    t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
            "Defines how to assign URIs to queues. Can assign by host, "
                    + "by ip, and into one of a fixed set of buckets (1k). NOTE: "
                    + "Use of policies other than the default "
                    + "HostnameQueueAssignmentPolicy is untested and provided "
                    + "for use at your own risk. Further, changing this policy "
                    + "during a crawl, or between restarts using the same data "
                    + "directory, is likely to cause unrecoverable problems.",
            DEFAULT_QUEUE_ASSIGNMENT_POLICY, queues));
    t.setExpertSetting(true);

    // Register persistent CrawlURI items 
    CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
    CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
}

From source file:edu.cornell.mannlib.vitro.webapp.utils.jena.JenaIngestUtils.java

/**
 * Splits values for a given data property URI on a supplied regex and 
 * asserts each value using newPropertyURI.  New statements returned in
 * a Jena Model.  Split values may be optionally trim()ed.
 * @param inModel
 * @param propertyURI
 * @param splitRegex
 * @param newPropertyURI
 * @param trim
 * @return outModel
 */
public Model splitPropertyValues(Model inModel, String propertyURI, String splitRegex, String newPropertyURI,
        boolean trim) {
    Model outModel = ModelFactory.createDefaultModel();
    Pattern delimiterPattern = Pattern.compile(splitRegex);
    Property theProp = ResourceFactory.createProperty(propertyURI);
    Property newProp = ResourceFactory.createProperty(newPropertyURI);
    inModel.enterCriticalSection(Lock.READ);
    try {
        StmtIterator stmtIt = inModel.listStatements((Resource) null, theProp, (RDFNode) null);
        try {
            while (stmtIt.hasNext()) {
                Statement stmt = stmtIt.nextStatement();
                Resource subj = stmt.getSubject();
                RDFNode obj = stmt.getObject();
                if (obj.isLiteral()) {
                    Literal lit = (Literal) obj;
                    String unsplitStr = lit.getLexicalForm();
                    String[] splitPieces = delimiterPattern.split(unsplitStr);
                    for (int i = 0; i < splitPieces.length; i++) {
                        String newLexicalForm = splitPieces[i];
                        if (trim) {
                            newLexicalForm = newLexicalForm.trim();
                        }
                        if (newLexicalForm.length() > 0) {
                            Literal newLiteral = null;
                            if (lit.getDatatype() != null) {
                                newLiteral = outModel.createTypedLiteral(newLexicalForm, lit.getDatatype());
                            } else {
                                if (lit.getLanguage() != null) {
                                    newLiteral = outModel.createLiteral(newLexicalForm, lit.getLanguage());
                                } else {
                                    newLiteral = outModel.createLiteral(newLexicalForm);
                                }
                            }
                            outModel.add(subj, newProp, newLiteral);
                        }
                    }
                }
            }
        } finally {
            stmtIt.close();
        }
    } finally {
        inModel.leaveCriticalSection();
    }
    return outModel;
}
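
A small usage sketch of splitPropertyValues above. The URIs, the ";" delimiter, and the no-argument construction of JenaIngestUtils are assumptions for illustration; the imports assume the org.apache.jena namespace (older Vitro builds use com.hp.hpl.jena instead):

import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Property;
import org.apache.jena.rdf.model.Resource;

public class SplitPropertyValuesDemo {
    public static void main(String[] args) {
        Model in = ModelFactory.createDefaultModel();
        Resource subj = in.createResource("http://example.org/book1");        // hypothetical URI
        Property keywords = in.createProperty("http://example.org/keywords"); // hypothetical URI
        in.add(subj, keywords, "semantic web; ontologies; RDF");

        // Split the single ";"-delimited literal into three trimmed statements.
        JenaIngestUtils utils = new JenaIngestUtils(); // assumes a no-arg constructor
        Model out = utils.splitPropertyValues(in, "http://example.org/keywords", ";",
                "http://example.org/keyword", true);
        out.write(System.out, "N-TRIPLE");
    }
}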

From source file:com.krawler.formbuilder.servlet.workflowHandler.java

private void writeXml(JSONObject jsonobj, String containerId, String processId, JSONObject linejson)
        throws TransformerConfigurationException, TransformerException, JSONException {
    JSONArray jarr = jsonobj.getJSONArray("data");
    JSONArray linearr = linejson.getJSONArray("data");
    String split = ":";
    for (int i = 0; i < jarr.length(); i++) {

        Pattern p = Pattern.compile(split);

        JSONObject jobj = jarr.getJSONObject(i);
        String id = jobj.getString("id");
        String value = jobj.getString("value");
        String[] ObjectVal = null;

        ObjectVal = p.split(value);
        if (ObjectVal[0].equals("process-swim")) {
            ObjectInfo obj = new ObjectInfo();
            obj.objId = ObjectVal[1];
            obj.name = ObjectVal[2];
            obj.xpos = ObjectVal[3];
            obj.ypos = ObjectVal[4];
            obj.width = ObjectVal[5];
            obj.height = ObjectVal[6];
            obj.parentId = ObjectVal[7];
            obj.type = "Pool";
            obj.handId = "";
            obj.refId = ObjectVal[8];
            this.poolContainer.add(obj);
        } else if (ObjectVal[0].equals("lane-swim")) {
            ObjectInfo obj = new ObjectInfo();
            obj.objId = ObjectVal[1];
            obj.name = ObjectVal[2];
            obj.xpos = ObjectVal[3];
            obj.ypos = ObjectVal[4];
            obj.width = ObjectVal[5];
            obj.height = ObjectVal[6];
            obj.parentId = ObjectVal[7];
            obj.processId = ObjectVal[8];
            obj.refId = ObjectVal[9];
            obj.type = "Lane";
            obj.handId = "";
            this.poolContainer.add(obj);
        } else {
            ObjectInfo obj = new ObjectInfo();
            if (ObjectVal[0].equals("task-activity")) {
                obj.type = "task";
            } else if (ObjectVal[0].equals("start")) {
                obj.type = "start";
                obj.handId = ObjectVal[7];
            } else if (ObjectVal[0].equals("end")) {
                obj.type = "end";
                obj.handId = ObjectVal[7];
            }
            obj.objId = ObjectVal[1];
            obj.name = ObjectVal[2];
            obj.xpos = ObjectVal[3];
            obj.ypos = ObjectVal[4];
            obj.width = ObjectVal[5];
            obj.height = ObjectVal[6];
            obj.parentId = ObjectVal[7];
            obj.processId = ObjectVal[8];
            obj.refId = ObjectVal[9];
            obj.hasStart = ObjectVal[10];
            obj.hasEnd = ObjectVal[11];
            obj.startRefId = ObjectVal[12];
            obj.endRefId = ObjectVal[13];
            obj.derivationRule = ObjectVal[14];
            obj.domEl = ObjectVal[15];
            this.taskContainer.add(obj);
        }
    }

    Element rootElet = dom.createElement("Package");
    rootElet.setAttribute("xmlns", "http://www.wfmc.org/2008/XPDL2.1");
    dom.appendChild(rootElet);
    Element ele = dom.createElement("PackageHeader");
    Element childElement = dom.createElement("XPDLVersion");
    Text text = dom.createTextNode("2.1");
    childElement.appendChild(text);
    ele.appendChild(childElement);
    rootElet.appendChild(ele);

    addPools(rootElet, containerId, processId);

    addWorkflow(rootElet, processId, linearr);

}