Example usage for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

List of usage examples for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringEscapeUtils unescapeHtml4.

Prototype

public static final String unescapeHtml4(final String input) 

Source Link

Document

Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.

Usage

From source file:com.openlopd.agpd.nota.ws.NotaWebservice.java

/**
 * Sends the given file, Base64-encoded, to the remote SOAP endpoint and returns the decoded reply.
 *
 * <p>The operation invoked is {@code registrarXml} when {@code entorno} equals
 * {@code Entornos.exp.name()} and {@code probarXml} otherwise. The reply is Base64-decoded,
 * interpreted as ISO-8859-1 text and HTML-unescaped before it is returned.
 *
 * @param xmlFile wrapper whose {@code getFile()} content is encoded and sent
 * @return the service reply, decoded and with HTML entities unescaped
 * @throws Exception if creating the call, invoking the service or decoding the reply fails
 */
public String registrarXml(FileDataBase xmlFile) throws Exception {
    final String serviceUrl = "https://www.aespd.es:443/agenciapd/axis/SolicitudService?wsdl";

    if (logger.isInfoEnabled()) {
        logger.info("Iniciando notificacin en el entorno de {}.", entorno);
    }

    Call soapCall = (Call) new Service().createCall();
    soapCall.setTargetEndpointAddress(new java.net.URL(serviceUrl));

    // Production environment registers the file; every other environment only tests it.
    String operation = entorno.equals(Entornos.exp.name()) ? "registrarXml" : "probarXml";
    soapCall.setOperationName(new QName("http://soapinterop.org/", operation));

    // Encode the file and send it.
    String encodedPayload = Base64.encode(xmlFile.getFile());
    String encodedReply = (String) soapCall.invoke(new Object[] { encodedPayload });

    // TODO: verify that the signature of this is correct.
    String reply = new String(Base64.decode(encodedReply), "ISO-8859-1");
    return StringEscapeUtils.unescapeHtml4(reply);
}

From source file:com.nttec.everychan.chans.cirno.Chan410Reader.java

/**
 * Parses the post date; when the superclass could not extract a timestamp, tries the
 * {@code SPAN_ADMIN_PATTERN} layout instead.
 *
 * <p>On a match, group 1 (trimmed and HTML-unescaped) is appended to the current post's trip
 * and group 2 is handed back to the superclass parser.
 *
 * @param date raw date fragment of the post being parsed
 */
@Override
protected void parseDate(String date) {
    super.parseDate(date);
    if (currentPost.timestamp != 0) {
        // The superclass already produced a timestamp; nothing more to do.
        return;
    }

    Matcher adminMatcher = SPAN_ADMIN_PATTERN.matcher(date);
    if (!adminMatcher.matches()) {
        return;
    }

    currentPost.trip = (currentPost.trip == null ? "" : currentPost.trip)
            + StringEscapeUtils.unescapeHtml4(adminMatcher.group(1).trim());
    super.parseDate(adminMatcher.group(2));
}

From source file:com.wellsandwhistles.android.redditsp.reddit.prepared.RedditParsedPost.java

/**
 * Returns the self text of the underlying post with HTML entities unescaped.
 *
 * @return {@code mSrc.selftext} passed through {@code StringEscapeUtils.unescapeHtml4}
 */
public String getUnescapedSelfText() {
    final String rawSelfText = mSrc.selftext;
    return StringEscapeUtils.unescapeHtml4(rawSelfText);
}

From source file:gr.demokritos.iit.cru.creativity.reasoning.semantic.WebMiner.java

/**
 * Mines web pages related to a seed phrase and returns the resulting tag cloud as JSON.
 *
 * <p>A random phrase is derived from {@code seed}; up to five URLs are collected for it (Bing
 * for "en" and "el", the fragfinn.de search page for "de"; any other language yields no URLs);
 * the pages are fetched concurrently; and the terms reported by
 * {@code InfoSummarization.TopTermsBing} are filtered and sorted into the tag cloud. When no
 * URL is found the JSON of an empty map is returned.
 *
 * <p>The connection opened for {@code language} is closed on every exit path, including when
 * an exception is thrown. If closing fails while another exception is propagating, the close
 * failure is the one the caller sees.
 *
 * @param seed        user-supplied phrase; also used as the search phrase for "en"/"de" when the
 *                    random phrase is empty
 * @param difficulty  forwarded to {@code RandomWordGenerator.selectRandomWord}
 * @param language    language code, compared case-insensitively against "en", "el" and "de"
 * @param compactForm forwarded to {@code InfoSummarization.TopTermsBing}
 * @return the tag cloud (term to weight) serialized with Gson
 * @throws ClassNotFoundException propagated from the collaborating classes
 * @throws SQLException           propagated from the collaborating classes
 * @throws IOException            propagated from the collaborating classes
 * @throws InstantiationException propagated from the collaborating classes
 * @throws IllegalAccessException propagated from the collaborating classes
 */
public static String WebMiner(String seed, int difficulty, String language, boolean compactForm)
        throws ClassNotFoundException, SQLException, IOException, InstantiationException,
        IllegalAccessException {
    final int maxUrls = 5;
    Connect c = new Connect(language);
    try {
        RandomWordGenerator r = new RandomWordGenerator(c);
        String randomPhrase = r.selectRandomWord(seed, difficulty).replace(",", " ");
        InfoSummarization inf = new InfoSummarization(c);
        LinkedHashMap<String, Double> tagCloud = new LinkedHashMap<String, Double>();

        Set<String> pages = new HashSet<String>();
        ArrayList<String> urls = new ArrayList<String>();

        boolean english = language.equalsIgnoreCase("en");
        if (english || language.equalsIgnoreCase("el")) {
            // Only the English path falls back to the seed when no random phrase was produced.
            if (english && randomPhrase.length() == 0) {
                randomPhrase = seed;
            }
            BingCrawler bc = new BingCrawler(c.getBingAppId(), language);
            ArrayList<String> found = bc.crawl(randomPhrase);
            for (int i = 0; i < maxUrls && i < found.size(); i++) {
                urls.add(found.get(i));
            }
        } else if (language.equalsIgnoreCase("de")) {
            if (randomPhrase.length() == 0) {
                randomPhrase = seed;
            }
            // Keep only the first word of the random phrase for the search.
            ArrayList<String> found = HTMLUtilities.linkExtractor(
                    "http://www.fragfinn.de/kinderliste/suche?start=0&query=" + randomPhrase.split(" ")[0],
                    "UTF-8", 0);
            for (String url : found) {
                urls.add(StringEscapeUtils.unescapeHtml4(url));
                if (urls.size() == maxUrls) {
                    break;
                }
            }
        }

        String delims = "[{} .,;?!():\"]+";
        // NOTE(review): commas were already replaced by spaces above unless randomPhrase was reset
        // to the seed, so this split usually yields a single element - confirm this is intended.
        String[] words = randomPhrase.split(",");
        String[] user_keywords = seed.split(delims);

        if (urls.size() > 0) {
            ExecutorService threadPool = Executors.newFixedThreadPool(urls.size());
            try {
                for (String url : urls) {
                    threadPool.submit(new HTMLPages(url, pages, language));
                }
            } finally {
                // Always stop accepting work so the worker threads cannot outlive this call.
                threadPool.shutdown();
            }
            // Block (instead of spinning) until every page task has finished.
            awaitTerminationUninterruptibly(threadPool);

            LinkedHashMap<ArrayList<String>, Double> termsByStem = inf.TopTermsBing(pages, compactForm);
            HashMap<String, Double> weightedTerms = new HashMap<String, Double>();
            for (ArrayList<String> stems : termsByStem.keySet()) {
                for (int j = 0; j < stems.size(); j++) {
                    String s = stems.get(j).split("\\{")[0];
                    s = s.replace(",", " ");
                    s = s.trim();

                    boolean isNewTerm = true;
                    // The term must not be part of the random phrase ...
                    for (int i = 0; i < words.length; i++) {
                        if (s.equalsIgnoreCase(words[i])) {
                            isNewTerm = false;
                        }
                    }
                    // ... nor one of the words the user supplied.
                    for (int i = 0; i < user_keywords.length; i++) {
                        if (s.equalsIgnoreCase(user_keywords[i])) {
                            isNewTerm = false;
                        }
                    }
                    // For German or Greek, skip terms for which the lookup returns a part of speech
                    // (per the original comment these are English words).
                    if (language.equalsIgnoreCase("de") || language.equalsIgnoreCase("el")) {
                        if (c.getWn().getCommonPos(s) != null) {
                            continue;
                        }
                    }
                    if (isNewTerm) {
                        // Every term of a stem is stored with that stem's weight.
                        weightedTerms.put(stems.get(j), termsByStem.get(stems));
                    }
                }
            }
            tagCloud = inf.sortHashMapByValues(weightedTerms);
        }
        return new Gson().toJson(tagCloud);
    } finally {
        c.CloseConnection();
    }
}

/** Blocks until the already shut-down pool has terminated, re-asserting any interrupt afterwards. */
private static void awaitTerminationUninterruptibly(ExecutorService pool) {
    boolean interrupted = false;
    try {
        while (!pool.isTerminated()) {
            try {
                pool.awaitTermination(1, java.util.concurrent.TimeUnit.MINUTES);
            } catch (InterruptedException e) {
                // Keep waiting, as the original loop did, but remember the interrupt.
                interrupted = true;
            }
        }
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}

From source file:com.joey.Fujikom.common.mapper.JsonMapper.java

/**
 * Creates a mapper with this project's serialization defaults.
 *
 * @param include inclusion rule applied to serialization; ignored when {@code null}
 */
public JsonMapper(Include include) {
    // Apply the requested inclusion rule, if one was given.
    if (include != null) {
        this.setSerializationInclusion(include);
    }
    // Further lenient-parsing settings live in enableSimple(), defined elsewhere in this class.
    this.enableSimple();
    // Do not fail on JSON properties that have no counterpart in the Java object.
    this.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);

    // Null values are written as the empty string.
    JsonSerializer<Object> nullAsEmptyString = new JsonSerializer<Object>() {
        @Override
        public void serialize(Object ignoredValue, JsonGenerator generator, SerializerProvider serializers)
                throws IOException, JsonProcessingException {
            generator.writeString("");
        }
    };
    this.getSerializerProvider().setNullValueSerializer(nullAsEmptyString);

    // Every String value is HTML-unescaped while it is written.
    JsonSerializer<String> htmlUnescaping = new JsonSerializer<String>() {
        @Override
        public void serialize(String text, JsonGenerator generator, SerializerProvider serializers)
                throws IOException, JsonProcessingException {
            generator.writeString(StringEscapeUtils.unescapeHtml4(text));
        }
    };
    SimpleModule stringModule = new SimpleModule();
    stringModule.addSerializer(String.class, htmlUnescaping);
    this.registerModule(stringModule);

    // Use the JVM's default time zone (an explicit alternative would be getTimeZone("GMT+8:00")).
    this.setTimeZone(TimeZone.getDefault());
}

From source file:fr.mcc.ginco.solr.TermSolrConverter.java

/**
 * Convert a Thesaurus Term into a SolrDocument
 *
 * @param thesaurusTerm/*from  w ww  .  j a  v a  2 s. com*/
 * @return SolrInputDocument
 */

public SolrInputDocument convertSolrTerm(ThesaurusTerm thesaurusTerm) {
    SolrInputDocument doc = new SolrInputDocument();
    doc.addField(SolrField.THESAURUSID, thesaurusTerm.getThesaurusId());
    doc.addField(SolrField.THESAURUSTITLE, thesaurusTerm.getThesaurus().getTitle());
    doc.addField(SolrField.IDENTIFIER, thesaurusTerm.getIdentifier());
    doc.addField(SolrField.LEXICALVALUE,
            StringEscapeUtils.unescapeHtml4(thesaurusTerm.getLexicalValue().replace("&apos;", "'")));
    doc.addField(SolrField.TYPE, ThesaurusTerm.class.getSimpleName());
    doc.addField(SolrField.LANGUAGE, thesaurusTerm.getLanguage().getId());
    if (thesaurusTerm.getConcept() != null) {
        doc.addField(SolrField.CONCEPTID, thesaurusTerm.getConcept().getIdentifier());
    }

    boolean preferred;

    if (thesaurusTerm.getPrefered() == null) {
        preferred = false;
    } else {
        preferred = thesaurusTerm.getPrefered();
    }

    if (preferred) {
        doc.addField(SolrField.EXT_TYPE, ExtEntityType.TERM_PREF);
    } else {
        doc.addField(SolrField.EXT_TYPE, ExtEntityType.TERM_NON_PREF);
    }

    Timestamp modifiedDate = new Timestamp(thesaurusTerm.getModified().getTime());
    doc.addField(SolrField.MODIFIED, modifiedDate);

    Timestamp createdDate = new Timestamp(thesaurusTerm.getCreated().getTime());
    doc.addField(SolrField.CREATED, createdDate);

    doc.addField(SolrField.STATUS, thesaurusTerm.getStatus());

    List<Note> notes = noteService.getTermNotePaginatedList(thesaurusTerm.getIdentifier(), 0, 0);
    for (Note note : notes) {
        doc.addField(SolrField.NOTES, note.getLexicalValue());
    }
    return doc;
}

From source file:gov.llnl.ontology.text.corpora.UkWacDocumentReader.java

/**
 * {@inheritDoc}/* w  w w .  ja v a  2  s.  c o  m*/
 */
public gov.llnl.ontology.text.Document readDocument(String doc, String corpusName) {
    String[] lines = doc.split("\\n");

    // Find the title.
    int titleStart = lines[0].indexOf("id=\"") + 4;
    int titleEnd = lines[0].lastIndexOf("\">");
    String key = lines[0].substring(titleStart, titleEnd);
    long id = key.hashCode();

    StringBuilder builder = new StringBuilder();
    for (int i = 1; i < lines.length - 1; ++i) {
        // Skip empty lines and xml tags.
        if (lines[i].length() == 0 || lines[i].endsWith("s>"))
            continue;

        lines[i] = StringEscapeUtils.unescapeHtml4(lines[i]);
        String[] toks = lines[i].split("\\s+");
        builder.append(toks[0]).append(" ");
    }

    return new SimpleDocument(corpusName, builder.toString(), doc, key, id, key, new HashSet<String>());
}

From source file:com.dalthed.tucan.TucanMobile.java

/**
 * Extracts the event name from a combined "number + name" string.
 *
 * <p>The input is split with {@code nbspPat}. When that yields exactly two parts, the second
 * part is returned with HTML entities unescaped; otherwise the input is returned unchanged.
 * Per the original documentation, an input such as
 * {@code 04-00-0126-vu&nbsp;Mathematik 1 (f&uuml;r ET)} yields the part after the
 * {@code &nbsp;}, i.e. the event name.
 *
 * @param evNameString the combined event string
 * @return the unescaped event name, or {@code evNameString} itself when it does not split into
 *         exactly two parts
 */
public static String getEventNameByString(String evNameString) {
    String[] parts = nbspPat.split(evNameString);
    if (parts.length != 2) {
        return evNameString;
    }
    return StringEscapeUtils.unescapeHtml4(parts[1]);
}

From source file:com.datumbox.framework.utilities.text.cleaners.HTMLCleaner.java

/**
 * Reduces an HTML fragment to text in three steps: {@code replaceImgWithAlt}, then
 * {@code safeRemoveAllTags}, then unescaping of HTML entities.
 *
 * @param text the HTML input
 * @return the result of the three steps applied in that order
 */
public static String extractText(String text) {
    String withAltText = replaceImgWithAlt(text);
    String withoutTags = safeRemoveAllTags(withAltText);
    return StringEscapeUtils.unescapeHtml4(withoutTags);
}

From source file:gr.demokritos.iit.textforms.TextForm.java

/**
 * Remove HTML strings from the text/*from  w w w.  j  ava 2 s  .  com*/
 */
protected void removeHTML() {
    this.text = this.text.replaceAll("&amp;", "and");
    this.text = StringEscapeUtils.unescapeHtml4(this.text);
}