Commits

Sam Adams committed 502a10a

Moved applications to bibdata project

  • Participants
  • Parent commits 0779c2d

Comments (0)

Files changed (12)

File applications/pom.xml

-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-    <modelVersion>4.0.0</modelVersion>
-
-    <parent>
-        <groupId>uk.ac.cam.ch.wwmm</groupId>
-        <artifactId>pub-crawler</artifactId>
-        <version>0.4-SNAPSHOT</version>
-    </parent>
-
-    <artifactId>pub-crawler-apps</artifactId>
-
-    <dependencies>
-
-        <dependency>
-            <groupId>uk.ac.cam.ch.wwmm.pubcrawler</groupId>
-            <artifactId>acs</artifactId>
-            <version>0.4-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>uk.ac.cam.ch.wwmm.pubcrawler</groupId>
-            <artifactId>elsevier</artifactId>
-            <version>0.4-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>uk.ac.cam.ch.wwmm.pubcrawler</groupId>
-            <artifactId>nature</artifactId>
-            <version>0.4-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>uk.ac.cam.ch.wwmm.pubcrawler</groupId>
-            <artifactId>iucr</artifactId>
-            <version>0.4-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>uk.ac.cam.ch.wwmm.pubcrawler</groupId>
-            <artifactId>rsc</artifactId>
-            <version>0.4-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>uk.ac.cam.ch.wwmm.pubcrawler</groupId>
-            <artifactId>springer</artifactId>
-            <version>0.4-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>uk.ac.cam.ch.wwmm.pubcrawler</groupId>
-            <artifactId>wiley</artifactId>
-            <version>0.4-SNAPSHOT</version>
-        </dependency>
-
-        <dependency>
-            <groupId>uk.ac.cam.ch.wwmm</groupId>
-            <artifactId>archiver</artifactId>
-            <version>0.4-SNAPSHOT</version>
-        </dependency>
-
-    </dependencies>
-
-    <build>
-        <plugins>
-            <plugin>
-                <artifactId>maven-assembly-plugin</artifactId>
-                <version>2.3</version>
-                <executions>
-                    <execution>
-                        <id>assembly</id>
-                        <phase>package</phase>
-                        <goals>
-                            <goal>single</goal>
-                        </goals>
-                    </execution>
-                </executions>
-                <configuration>
-                    <descriptorRefs>
-                        <descriptorRef>jar-with-dependencies</descriptorRef>
-                    </descriptorRefs>
-                </configuration>
-            </plugin>
-        </plugins>
-    </build>
-
-</project>
-

File applications/src/main/java/wwmm/pubcrawler/main/AcsBibliographyCrawlerApplication.java

-package wwmm.pubcrawler.main;
-
-import com.google.inject.Module;
-import wwmm.pubcrawler.crawlers.acs.AcsCrawlerModule;
-import wwmm.pubcrawler.crawlers.acs.tasks.AcsBibliographyCrawlSeedTask;
-
-/**
- * @author Sam Adams
- */
-public class AcsBibliographyCrawlerApplication extends CrawlerApplication {
-
-    @Override
-    protected Class<? extends Runnable> getSeederType() {
-        return AcsBibliographyCrawlSeedTask.class;
-    }
-
-    @Override
-    protected Module getPublisherModule() {
-        return new AcsCrawlerModule();
-    }
-
-    public static void main(final String[] args) throws Exception {
-        new AcsBibliographyCrawlerApplication().run();
-    }
-}

File applications/src/main/java/wwmm/pubcrawler/main/CrawlerApplication.java

-package wwmm.pubcrawler.main;
-
-import com.google.inject.Guice;
-import com.google.inject.Injector;
-import com.google.inject.Module;
-import com.mongodb.DB;
-import com.mongodb.Mongo;
-import wwmm.pubcrawler.controller.CrawlerExecutor;
-import wwmm.pubcrawler.controller.ResumeTask;
-import wwmm.pubcrawler.inject.HttpFetcherModule;
-import wwmm.pubcrawler.inject.MongoRepositoryModule;
-import wwmm.pubcrawler.inject.PubcrawlerModule;
-import wwmm.pubserver.archiver.ArchiveProcessor;
-import wwmm.pubserver.archiver.PubserverArchiverModule;
-
-import java.net.UnknownHostException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-/**
- * @author Sam Adams
- */
-public abstract class CrawlerApplication {
-    
-    public void run() throws UnknownHostException {
-        final String host = System.getProperty("pubcrawler.mongo.host", "localhost");
-        final String pubDbName = System.getProperty("pubcrawler.mongo.bibdb", "bib");
-        final String taskDbName = System.getProperty("pubcrawler.mongo.taskdb", "task");
-        final String httpDbName = System.getProperty("pubcrawler.mongo.httpdb", "http");
-
-        final Mongo mongo = new Mongo(host);
-        final DB pubdb = mongo.getDB(pubDbName);
-        final DB taskdb = mongo.getDB(taskDbName);
-        final DB httpdb = mongo.getDB(httpDbName);
-
-        final Injector injector = Guice.createInjector(
-            new PubcrawlerModule(),
-            new HttpFetcherModule(httpdb),
-            new MongoRepositoryModule(pubdb, taskdb),
-            new PubserverArchiverModule(),
-            getPublisherModule()
-        );
-
-        final ExecutorService taskExecutor = Executors.newSingleThreadExecutor();
-        final ExecutorService archiveExecutor = Executors.newSingleThreadExecutor();
-
-        final ArchiveProcessor archiveProcessor = injector.getInstance(ArchiveProcessor.class);
-        archiveExecutor.submit(archiveProcessor);
-
-        final Class<? extends Runnable> seederType = getSeederType();
-        final Runnable seedRunner = injector.getInstance(seederType);
-        taskExecutor.submit(seedRunner);
-
-        final Runnable resumeRunner = injector.getInstance(ResumeTask.class);
-        taskExecutor.submit(resumeRunner);
-
-        final CrawlerExecutor crawlRunner = injector.getInstance(CrawlerExecutor.class);
-        taskExecutor.submit(crawlRunner);
-
-        // Wait for CrawlRunner to complete and stop executor service
-        taskExecutor.shutdown();
-    }
-
-    protected abstract Module getPublisherModule();
-
-    protected abstract Class<? extends Runnable> getSeederType();
-}

File applications/src/main/java/wwmm/pubcrawler/main/ElsevierBibliographyCrawlerApplication.java

-package wwmm.pubcrawler.main;
-
-import com.google.inject.Module;
-import wwmm.pubcrawler.crawlers.elsevier.ElsevierCrawlerModule;
-import wwmm.pubcrawler.crawlers.elsevier.tasks.ElsevierBibliographyCrawlSeedTask;
-
-/**
- * @author Sam Adams
- */
-public class ElsevierBibliographyCrawlerApplication extends CrawlerApplication {
-
-    @Override
-    protected Class<? extends Runnable> getSeederType() {
-        return ElsevierBibliographyCrawlSeedTask.class;
-    }
-
-    @Override
-    protected Module getPublisherModule() {
-        return new ElsevierCrawlerModule();
-    }
-
-    public static void main(final String[] args) throws Exception {
-        new ElsevierBibliographyCrawlerApplication().run();
-    }
-
-}

File applications/src/main/java/wwmm/pubcrawler/main/Export.java

-package wwmm.pubcrawler.main;
-
-import com.google.inject.Guice;
-import com.google.inject.Injector;
-import com.mongodb.DB;
-import com.mongodb.Mongo;
-import uk.ac.cam.ch.wwmm.httpcrawler.CrawlerResponse;
-import uk.ac.cam.ch.wwmm.httpcrawler.HttpFetcher;
-import wwmm.pubcrawler.inject.HttpFetcherModule;
-
-import java.io.*;
-import java.net.UnknownHostException;
-
-/**
- * @author Sam Adams
- */
-public class Export {
-
-    public static HttpFetcher getFetcher() throws UnknownHostException {
-        final String host = System.getProperty("pubcrawler.mongo.host", "localhost");
-        final String httpDbName = System.getProperty("pubcrawler.mongo.httpdb", "http");
-
-        final Mongo mongo = new Mongo(host);
-        final DB httpdb = mongo.getDB(httpDbName);
-
-        final Injector injector = Guice.createInjector(
-            new HttpFetcherModule(httpdb)
-        );
-
-        return injector.getInstance(HttpFetcher.class);
-    }
-
-    public static void main(final String[] args) throws IOException {
-        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
-        String key = in.readLine();
-
-        HttpFetcher fetcher = getFetcher();
-        CrawlerResponse response = fetcher.fetchFromCache(key);
-        if (response != null) {
-            PrintStream out = new PrintStream(new File("export.html"), "UTF-8");
-            out.print(response.getEntityAsString());
-            out.close();
-        }
-    }
-    
-}

File applications/src/main/java/wwmm/pubcrawler/main/IucrBibliographyCrawlerApplication.java

-package wwmm.pubcrawler.main;
-
-import com.google.inject.Module;
-import wwmm.pubcrawler.crawlers.acta.IucrCrawlerModule;
-import wwmm.pubcrawler.crawlers.acta.tasks.IucrBibliographyCrawlSeedTask;
-
-/**
- * @author Sam Adams
- */
-public class IucrBibliographyCrawlerApplication extends CrawlerApplication {
-
-    @Override
-    protected Class<? extends Runnable> getSeederType() {
-        return IucrBibliographyCrawlSeedTask.class;
-    }
-
-    @Override
-    protected Module getPublisherModule() {
-        return new IucrCrawlerModule();
-    }
-
-    public static void main(final String[] args) throws Exception {
-        new IucrBibliographyCrawlerApplication().run();
-    }
-}

File applications/src/main/java/wwmm/pubcrawler/main/NatureBibliographyCrawlerApplication.java

-package wwmm.pubcrawler.main;
-
-import com.google.inject.Module;
-import wwmm.pubcrawler.crawlers.nature.NatureCrawlerModule;
-import wwmm.pubcrawler.crawlers.nature.tasks.NatureBibliographyCrawlSeedTask;
-
-/**
- * @author Sam Adams
- */
-public class NatureBibliographyCrawlerApplication extends CrawlerApplication {
-
-    @Override
-    protected Class<? extends Runnable> getSeederType() {
-        return NatureBibliographyCrawlSeedTask.class;
-    }
-
-    @Override
-    protected Module getPublisherModule() {
-        return new NatureCrawlerModule();
-    }
-
-    public static void main(final String[] args) throws Exception {
-        new NatureBibliographyCrawlerApplication().run();
-    }
-}

File applications/src/main/java/wwmm/pubcrawler/main/RscBibliographyCrawlerApplication.java

-package wwmm.pubcrawler.main;
-
-import com.google.inject.Module;
-import wwmm.pubcrawler.crawlers.nature.NatureCrawlerModule;
-import wwmm.pubcrawler.crawlers.nature.tasks.NatureBibliographyCrawlSeedTask;
-import wwmm.pubcrawler.crawlers.rsc.RscCrawlerModule;
-import wwmm.pubcrawler.crawlers.rsc.tasks.RscBibliographyCrawlSeedTask;
-
-/**
- * @author Sam Adams
- */
-public class RscBibliographyCrawlerApplication extends CrawlerApplication {
-
-    @Override
-    protected Class<? extends Runnable> getSeederType() {
-        return RscBibliographyCrawlSeedTask.class;
-    }
-
-    @Override
-    protected Module getPublisherModule() {
-        return new RscCrawlerModule();
-    }
-
-    public static void main(final String[] args) throws Exception {
-        new RscBibliographyCrawlerApplication().run();
-    }
-}

File applications/src/main/java/wwmm/pubcrawler/main/SpringerBibliographyCrawlerApplication.java

-package wwmm.pubcrawler.main;
-
-import com.google.inject.Module;
-import wwmm.pubcrawler.crawlers.springer.SpringerCrawlerModule;
-import wwmm.pubcrawler.crawlers.springer.tasks.SpringerBibliographyCrawlSeedTask;
-
-/**
- * @author Sam Adams
- */
-public class SpringerBibliographyCrawlerApplication extends CrawlerApplication {
-
-    @Override
-    protected Class<? extends Runnable> getSeederType() {
-        return SpringerBibliographyCrawlSeedTask.class;
-    }
-
-    @Override
-    protected Module getPublisherModule() {
-        return new SpringerCrawlerModule();
-    }
-
-    public static void main(final String[] args) throws Exception {
-        new SpringerBibliographyCrawlerApplication().run();
-    }
-}

File applications/src/main/java/wwmm/pubcrawler/main/WileyBibliographyCrawlerApplication.java

-package wwmm.pubcrawler.main;
-
-import com.google.inject.Module;
-import wwmm.pubcrawler.crawlers.wiley.WileyCrawlerModule;
-import wwmm.pubcrawler.crawlers.wiley.tasks.WileyBibliographyCrawlSeedTask;
-
-/**
- * @author Sam Adams
- */
-public class WileyBibliographyCrawlerApplication extends CrawlerApplication {
-
-    @Override
-    protected Class<? extends Runnable> getSeederType() {
-        return WileyBibliographyCrawlSeedTask.class;
-    }
-
-    @Override
-    protected Module getPublisherModule() {
-        return new WileyCrawlerModule();
-    }
-
-    public static void main(final String[] args) throws Exception {
-        new WileyBibliographyCrawlerApplication().run();
-    }
-}

File applications/src/main/resources/log4j.properties

-log4j.rootLogger=INFO, CONSOLE, FILE, FILE_ERR
-
-log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
-log4j.appender.CONSOLE.target=System.err
-log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
-log4j.appender.CONSOLE.layout.ConversionPattern=%d [%t] %-5p %c %x - %m%n
-
-log4j.appender.FILE=org.apache.log4j.RollingFileAppender
-log4j.appender.FILE.file=pubcrawler.log
-log4j.appender.FILE.maximumFileSize=10MB
-log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
-log4j.appender.FILE.layout.ConversionPattern=%d [%t] %-5p %c %x - %m%n
-
-log4j.appender.FILE_ERR=org.apache.log4j.RollingFileAppender
-log4j.appender.FILE_ERR.file=error.log
-log4j.appender.FILE_ERR.maximumFileSize=10MB
-log4j.appender.FILE_ERR.threshold=WARN
-log4j.appender.FILE_ERR.layout=org.apache.log4j.PatternLayout
-log4j.appender.FILE_ERR.layout.ConversionPattern=%d [%t] %-5p %c %x - %m%n
-
-log4j.logger.org.apache.http.client.protocol.ResponseProcessCookies=ERROR
-log4j.logger.uk.ac.cam.ch.wwmm.httpcrawler=ERROR
     <packaging>pom</packaging>
 
     <modules>
-        <module>applications</module>
         <module>core</module>
         <module>data</module>
         <module>publishers</module>