Java tutorial
/* * Copyright 2009-2013 Scale Unlimited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package bixo.examples.crawl; import static org.junit.Assert.assertEquals; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.io.FileUtils; import org.junit.Before; import org.junit.Test; import bixo.config.BixoPlatform; import bixo.config.BixoPlatform.Platform; import bixo.datum.UrlDatum; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.pipe.Every; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.tap.SinkMode; import cascading.tap.Tap; import cascading.tuple.Fields; import cascading.tuple.TupleEntry; import cascading.tuple.TupleEntryCollector; import cascading.tuple.TupleEntryIterator; import com.scaleunlimited.cascading.BasePath; import com.scaleunlimited.cascading.BasePlatform; public class LatestUrlDatumBufferTest { private static final String WORKINGDIR = "build/test/LatestUrlDatumBufferTest"; @Before public void setUp() throws IOException { File workingFolder = new File(WORKINGDIR); if (workingFolder.exists()) { FileUtils.deleteDirectory(workingFolder); } } /* Can't use the test below since it doesn't simulate the reusing of tuples in a Cascading * GroupBy operation. * In particular it will fail to catch a case where an assignment of the type * aDatum = datum * is being incorrectly done. * Instead we want it to be * aDatum = new DatumType(datum) */ /* @Test public void testOperate() throws BaseFetchException, IOException { LatestUrlDatumBuffer op = new LatestUrlDatumBuffer(); HadoopFlowProcess fp = Mockito.mock(HadoopFlowProcess.class); Mockito.when(fp.getJobConf()).thenReturn(new JobConf()); OperationCall<NullContext> oc = Mockito.mock(OperationCall.class); BufferCall<NullContext> bc = Mockito.mock(BufferCall.class); TupleEntryCollector collector = Mockito.mock(TupleEntryCollector.class); List<TupleEntry> tupleEntryList = new ArrayList<TupleEntry>(); UrlDatum urlDatum1 = new UrlDatum("http://foo.com"); urlDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L); urlDatum1.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, UrlStatus.UNFETCHED); TupleEntry entry1 = new TupleEntry(UrlDatum.FIELDS); entry1.setTuple(urlDatum1.getTuple()); tupleEntryList.add(entry1); UrlDatum urlDatum2 = new UrlDatum("http://foo.com"); urlDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L); urlDatum2.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, UrlStatus.FETCHED); TupleEntry entry2 = new TupleEntry(UrlDatum.FIELDS); entry2.setTuple(urlDatum2.getTuple()); tupleEntryList.add(entry2); UrlDatum urlDatum3 = new UrlDatum("http://foo.com"); urlDatum3.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L); urlDatum3.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, UrlStatus.UNFETCHED); TupleEntry entry3 = new TupleEntry(UrlDatum.FIELDS); entry3.setTuple(urlDatum3.getTuple()); tupleEntryList.add(entry3); Mockito.when(bc.getArgumentsIterator()).thenReturn(tupleEntryList.iterator()); Mockito.when(bc.getOutputCollector()).thenReturn(collector); op.prepare(fp, oc); op.operate(fp, bc); op.cleanup(fp, oc); Mockito.verify(collector, Mockito.times(1)).add(Mockito.argThat(new MatchUrlDatum())); Mockito.verifyNoMoreInteractions(collector); } private static class MatchUrlDatum extends ArgumentMatcher<Tuple> { @Override public boolean matches(Object argument) { TupleEntry entry = new TupleEntry(UrlDatum.FIELDS); entry.setTuple((Tuple)argument); UrlDatum datum = new UrlDatum(entry); Long expectedVal = new Long(2); Long result = (Long)datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD); if (result.longValue() == expectedVal.longValue()) { return true; } return false; } } */ @SuppressWarnings({ "rawtypes", "unchecked" }) @Test public void testOperateWithGroupBy() throws Exception { BixoPlatform platform = new BixoPlatform(LatestUrlDatumBufferTest.class, Platform.Local); // Create a temp file with a fetched url BasePath workingDirPath = platform.makePath(WORKINGDIR); BasePath fetchedDatumsPath = platform.makePath(workingDirPath, "fetched"); ArrayList<UrlDatum> fetchedDatums = new ArrayList<UrlDatum>(); UrlDatum fetchedDatum1 = new UrlDatum("http://foo.com"); fetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L); fetchedDatums.add(fetchedDatum1); createDataFile(platform, fetchedDatumsPath, fetchedDatums); // And another with unfetched urls BasePath unfetchedDatumsPath = platform.makePath(workingDirPath, "unfetched"); ArrayList<UrlDatum> unfetchedDatums = new ArrayList<UrlDatum>(); UrlDatum unfetchedDatum1 = new UrlDatum("http://foo.com"); unfetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L); unfetchedDatums.add(unfetchedDatum1); UrlDatum unfetchedDatum2 = new UrlDatum("http://foo.com"); unfetchedDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L); unfetchedDatums.add(unfetchedDatum2); createDataFile(platform, unfetchedDatumsPath, unfetchedDatums); // create a workflow Tap inputSource1 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), fetchedDatumsPath); Pipe fetchedPipe = new Pipe("fetched"); Tap inputSource2 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), unfetchedDatumsPath); Pipe unfetchedPipe = new Pipe("unfetched"); Map<String, Tap> sources = new HashMap<String, Tap>(); sources.put(fetchedPipe.getName(), inputSource1); sources.put(unfetchedPipe.getName(), inputSource2); BasePath resultsPath = platform.makePath(workingDirPath, "results"); Tap resultSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath, SinkMode.REPLACE); Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe), new Fields(UrlDatum.URL_FN)); resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS); FlowConnector flowConnector = platform.makeFlowConnector(); Flow flow = flowConnector.connect(sources, resultSink, resultsPipe); flow.complete(); // verify that the resulting pipe has the latest tuple Tap testSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath); TupleEntryIterator reader = testSink.openForRead(platform.makeFlowProcess()); int count = 0; long latest = 0; while (reader.hasNext()) { TupleEntry next = reader.next(); UrlDatum datum = new UrlDatum(next); latest = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD); count++; } assertEquals(1, count); assertEquals(2, latest); } @SuppressWarnings({ "unchecked", "rawtypes" }) private void createDataFile(BasePlatform platform, BasePath filePath, List<UrlDatum> datums) throws Exception { Tap urlSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), filePath, SinkMode.REPLACE); TupleEntryCollector writer = urlSink.openForWrite(platform.makeFlowProcess()); for (UrlDatum datum : datums) { writer.add(datum.getTuple()); } writer.close(); } }