Java tutorial
/* * This file is part of CoAnSys project. * Copyright (c) 2012-2015 ICM-UW * * CoAnSys is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * CoAnSys is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>. */ /* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package pl.edu.icm.coansys.source; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Reducer; import pl.edu.icm.coansys.models.DocumentProtos; import pl.edu.icm.coansys.models.ParentModelProtos; /** * * @author kura */ public class ReduceDocsWithSameSourceId extends Reducer<Writable, BytesWritable, Text, BytesWritable> { @Override protected void reduce(Writable key, Iterable<BytesWritable> values, Context context) throws IOException, InterruptedException { ArrayList<String> docIds = new ArrayList<String>(); HashSet<String> issns = new HashSet<String>(); HashSet<String> isbns = new HashSet<String>(); HashSet<String> titles = new HashSet<String>(); for (BytesWritable value : values) { DocumentProtos.DocumentWrapper docWrapper = DocumentProtos.DocumentWrapper.parseFrom(value.copyBytes()); docIds.add(docWrapper.getRowId()); String issn = null; String isbn = null; if (docWrapper.hasDocumentMetadata() && docWrapper.getDocumentMetadata().hasBasicMetadata()) { DocumentProtos.BasicMetadataOrBuilder bm = docWrapper.getDocumentMetadata() .getBasicMetadataOrBuilder(); if (bm.hasIssn()) { issn = bm.getIssn(); } if (bm.hasIsbn()) { isbn = bm.getIsbn(); } if (StringUtils.isNotBlank(isbn)) { isbn = isbn.trim().toUpperCase(); isbns.add(isbn); } if (StringUtils.isNotBlank(issn)) { issn = issn.trim().toUpperCase(); issns.add(issn); } if (bm.hasJournal()) { String title = bm.getJournal().trim(); if (StringUtils.isNotBlank(title)) { titles.add(title); } } } } String issn = null; String isbn = null; ArrayList<String> titleA = new ArrayList<String>(titles); Collections.sort(titleA, new Comparator<String>() { @Override public int compare(String o1, String o2) { //najdusze naprzd return o2.length() - o1.length(); } }); if (issns.size() > 0) { issn = issns.iterator().next(); } if (isbns.size() > 0) { isbn = isbns.iterator().next(); } String id = "http://comac.ceon.pl/source-"; if (issn != null) { id += ("issn-" + issn); if (isbn != null) { id += "-"; } } if (isbn != null) { id += ("isbn-" + isbn); } for (String docId : docIds) { ParentModelProtos.ParentDisambiguationOut.Builder parent = ParentModelProtos.ParentDisambiguationOut .newBuilder(); parent.setDocId(docId); parent.setParentId(id); parent.addAllParentName(titles); if (isbn != null) { parent.setType(DocumentProtos.BasicMetadata.ParentType.BOOK); } else { parent.setType(DocumentProtos.BasicMetadata.ParentType.JOURNAL); } context.write(new Text(docId), new BytesWritable(parent.build().toByteArray())); } } }