1. wwmm
  2. pub-crawler

Commits

Sam Adams  committed e824ec6 Draft

Tidying up

  • Participants
  • Parent commits 70f9753
  • Branches default

Comments (0)

Files changed (7)

File core/src/main/java/wwmm/pubcrawler/crawler/CrawlRunner.java

View file
 /**
  * @author Sam Adams
  */
-public interface CrawlRunner<Data> {
+public interface CrawlRunner<T> {
 
-    void run(String id, Data data) throws Exception;
+    void run(String id, T data) throws Exception;
 
 }

File core/src/main/java/wwmm/pubcrawler/crawler/common/HtmlCrawler.java

-package wwmm.pubcrawler.crawler.common;
-
-import nu.xom.Builder;
-import nu.xom.Document;
-import nu.xom.ParsingException;
-import uk.ac.cam.ch.wwmm.httpcrawler.CrawlerResponse;
-
-import java.io.IOException;
-import java.io.InputStreamReader;
-
-/**
- * @author Sam Adams
- */
-public abstract class HtmlCrawler {
-
-    protected static Document readDocument(final CrawlerResponse response) throws IOException {
-        final String encoding = response.getCharacterEncoding();
-        if (encoding == null) {
-            return readDocument(response, newTagSoupBuilder());
-        } else {
-            return readDocument(response, newTagSoupBuilder(), encoding);
-        }
-    }
-
-    protected static Document readDocument(final CrawlerResponse response, final Builder builder) throws IOException {
-        try {
-            final Document doc = builder.build(response.getContent());
-            setDocBaseUrl(response, doc);
-            return doc;
-        } catch (ParsingException e) {
-            throw new IOException("Error reading XML", e);
-        }
-    }
-
-    protected static Document readDocument(final CrawlerResponse response, final Builder builder, final String encoding) throws IOException {
-        try {
-            final InputStreamReader isr = new InputStreamReader(response.getContent(), encoding);
-            final Document doc = builder.build(isr);
-            setDocBaseUrl(response, doc);
-            return doc;
-        } catch (ParsingException e) {
-            throw new IOException("Error reading XML", e);
-        }
-    }
-
-    protected static void setDocBaseUrl(final CrawlerResponse response, final Document doc) {
-        final String url = response.getUrl().toString();
-        doc.setBaseURI(removeFragment(url));
-    }
-
-    protected static String removeFragment(final String url) {
-        final int index = url.indexOf('#');
-        return (index == -1) ? url : url.substring(0, index);
-    }
-
-    protected static Builder newTagSoupBuilder() {
-        return new Builder(new org.ccil.cowan.tagsoup.Parser());
-    }
-
-}

File core/src/main/java/wwmm/pubcrawler/http/Fetcher.java

View file
 /**
  * @author Sam Adams
  */
-public interface Fetcher<Request,Resource> {
+public interface Fetcher<R, T> {
     
-    Resource fetch(Request request) throws Exception;
+    T fetch(R request) throws Exception;
     
 }

File core/src/main/java/wwmm/pubcrawler/processors/ArticleProcessor.java

View file
  * @author Sam Adams
  */
 @Singleton
-public class ArticleProcessor<Resource> implements ResourceProcessor<Resource, ArticleCrawlTaskData> {
+public class ArticleProcessor<R> implements ResourceProcessor<R, ArticleCrawlTaskData> {
 
     private static final Logger LOG = LoggerFactory.getLogger(ArticleProcessor.class);
 
     private final ArticleArchiver articleArchiver;
     private final ArticleHandler articleHandler;
-    private final ArticleParserFactory<Resource> parserFactory;
+    private final ArticleParserFactory<R> parserFactory;
 
     @Inject
-    public ArticleProcessor(final ArticleArchiver articleArchiver, final ArticleHandler articleHandler, final ArticleParserFactory<Resource> parserFactory) {
+    public ArticleProcessor(final ArticleArchiver articleArchiver, final ArticleHandler articleHandler, final ArticleParserFactory<R> parserFactory) {
         this.articleArchiver = articleArchiver;
         this.articleHandler = articleHandler;
         this.parserFactory = parserFactory;
     }
 
     @Override
-    public void process(final String taskId, final ArticleCrawlTaskData data, final Resource resource) {
+    public void process(final String taskId, final ArticleCrawlTaskData data, final R resource) {
         final ArticleId articleRef = data.getArticleRef();
         final ArticleParser parser = parserFactory.createArticleParser(articleRef, resource);
         handleArticle(taskId, parser);

File core/src/main/java/wwmm/pubcrawler/processors/IssueListProcessor.java

View file
  * @author Sam Adams
  */
 @Singleton
-public class IssueListProcessor<Resource, TaskData extends IssueListCrawlTaskData> implements ResourceProcessor<Resource, TaskData> {
+public class IssueListProcessor<R, T extends IssueListCrawlTaskData> implements ResourceProcessor<R, T> {
 
-    private final IssueListParserFactory<Resource> parserFactory;
+    private final IssueListParserFactory<R> parserFactory;
     private final IssueHandler issueHandler;
 
     @Inject
-    public IssueListProcessor(final IssueListParserFactory<Resource> parserFactory, final IssueHandler issueHandler) {
+    public IssueListProcessor(final IssueListParserFactory<R> parserFactory, final IssueHandler issueHandler) {
         this.parserFactory = parserFactory;
         this.issueHandler = issueHandler;
     }
 
     @Override
-    public void process(final String taskId, final TaskData data, final Resource resource) {
+    public void process(final String taskId, final T data, final R resource) {
         final PublisherId publisherId = new PublisherId(data.getPublisher());
         final JournalId journalId = new JournalId(publisherId, data.getJournal());
 

File core/src/main/java/wwmm/pubcrawler/processors/IssueTocProcessor.java

View file
  * @author Sam Adams
  */
 @Singleton
-public class IssueTocProcessor<Resource> implements ResourceProcessor<Resource,IssueTocCrawlTaskData> {
+public class IssueTocProcessor<R> implements ResourceProcessor<R, IssueTocCrawlTaskData> {
 
     private static final Logger LOG = LoggerFactory.getLogger(IssueTocProcessor.class);
 
     private final ArticleArchiver articleArchiver;
     private final IssueHandler issueHandler;
     private final ArticleHandler articleHandler;
-    private final IssueTocParserFactory<Resource> parserFactory;
+    private final IssueTocParserFactory<R> parserFactory;
 
     @Inject
-    public IssueTocProcessor(final IssueArchiver issueArchiver, final IssueHandler issueHandler, final ArticleArchiver articleArchiver, final ArticleHandler articleHandler, final IssueTocParserFactory<Resource> parserFactory) {
+    public IssueTocProcessor(final IssueArchiver issueArchiver, final IssueHandler issueHandler, final ArticleArchiver articleArchiver, final ArticleHandler articleHandler, final IssueTocParserFactory<R> parserFactory) {
         this.issueArchiver = issueArchiver;
         this.issueHandler = issueHandler;
         this.articleArchiver = articleArchiver;
     }
 
     @Override
-    public void process(final String taskId, final IssueTocCrawlTaskData data, final Resource resource) {
+    public void process(final String taskId, final IssueTocCrawlTaskData data, final R resource) {
         final PublisherId publisherId = new PublisherId(data.getPublisher());
         final JournalId journalId = new JournalId(publisherId, data.getJournal());
         final IssueTocParser parser = parserFactory.createIssueTocParser(journalId, resource);

File core/src/main/java/wwmm/pubcrawler/processors/PublicationListProcessor.java

View file
  * @author Sam Adams
  */
 @Singleton
-public class PublicationListProcessor<Resource> implements ResourceProcessor<Resource, HttpCrawlTaskData> {
+public class PublicationListProcessor<R> implements ResourceProcessor<R, HttpCrawlTaskData> {
 
     private static final Logger LOG = LoggerFactory.getLogger(PublicationListProcessor.class);
 
-    private final PublicationListParserFactory<Resource> parserFactory;
+    private final PublicationListParserFactory<R> parserFactory;
     private final JournalArchiver journalArchiver;
     private final JournalHandler journalHandler;
 
     @Inject
-    public PublicationListProcessor(final PublicationListParserFactory<Resource> parserFactory, final JournalArchiver journalArchiver, final JournalHandler journalHandler) {
+    public PublicationListProcessor(final PublicationListParserFactory<R> parserFactory, final JournalArchiver journalArchiver, final JournalHandler journalHandler) {
         this.parserFactory = parserFactory;
         this.journalArchiver = journalArchiver;
         this.journalHandler = journalHandler;
     }
 
     @Override
-    public void process(final String taskId, final HttpCrawlTaskData httpCrawlTask, final Resource resource) {
+    public void process(final String taskId, final HttpCrawlTaskData httpCrawlTask, final R resource) {
         final PublicationListParser parser = parserFactory.createPublicationListParser(resource);
         final List<Journal> journals = parser.findJournals();
         for (final Journal journal : journals) {