Snippets

Yclept Nemo Initial html content extractor for recoll

Updated by Yclept Nemo

File Makefile Modified

  • Ignore whitespace
  • Hide word diff
 all: BoilerpipeHandler.jar
 
-BoilerpipeHandler.jar: BoilerpipeHandler.manifest BoilerpipeHandler.class BoilerpipeArguments.class CheckedInputStream.class UnrecognizedArgumentException.class ExclusiveOptionsException.class BoilerpipeInterface.class
+BoilerpipeHandler.jar: BoilerpipeHandler.manifest BoilerpipeHandler.class BoilerpipeArguments.class CheckedInputStream.class UnrecognizedArgumentException.class ExclusiveOptionsException.class BoilerpipeInterface.class ByteArrayChannel.class
 	jar -cfm $@ $^
 
 BoilerpipeHandler.class: BoilerpipeHandler.java
Updated by Yclept Nemo

File BoilerpipeHandler.java Modified

  • Ignore whitespace
  • Hide word diff
 import java.util.Locale;
 import java.io.*;
 import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
+import java.nio.channels.ClosedChannelException;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
+import java.net.URL;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
 // External:
 import org.apache.commons.cli.*;
 import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
 import org.mozilla.universalchardet.UniversalDetector;
 
 
+/**
+ * A growable, in-memory {@link SeekableByteChannel}.
+ *
+ * Content is held in two parts: {@code bytes}, the settled prefix, and
+ * {@code buffer}, a fixed-size tail that absorbs writes past the end of
+ * {@code bytes}. {@code position_buffer} counts the valid bytes in
+ * {@code buffer}; once the tail is full it is folded into {@code bytes}.
+ * Positions and sizes are limited to the {@code int} range, and no
+ * synchronization is performed.
+ */
+class ByteArrayChannel implements SeekableByteChannel {
+    private int position_current = 0;
+    private int position_buffer = 0;
+    private byte[] buffer = new byte[8192];
+    private byte[] bytes;
+    private boolean closed = false;
+
+    /** Creates an empty channel. */
+    public ByteArrayChannel() {
+        bytes = new byte[0];
+    }
+
+    /** Creates a channel whose initial content is a copy of {@code bytes}. */
+    public ByteArrayChannel(byte [] bytes) {
+        this.bytes = bytes.clone();
+    }
+
+    /** Returns the current position. */
+    @Override
+    public long position() throws ClosedChannelException {
+        check_closed();
+
+        return position_current;
+    }
+
+    /**
+     * Moves the current position; positions beyond the current size are allowed.
+     *
+     * @throws IllegalArgumentException if the position is negative
+     * @throws ArithmeticException      if the position does not fit in an {@code int}
+     */
+    @Override
+    public ByteArrayChannel position(long position_new_long) throws ClosedChannelException, ArithmeticException, IllegalArgumentException {
+        check_closed();
+
+        // Test the sign on the long value so that every negative argument is
+        // reported as an illegal argument rather than as an overflow.
+        if (position_new_long < 0) {
+            throw new IllegalArgumentException("position must not be negative");
+        }
+        position_current = Math.toIntExact(position_new_long);
+
+        return this;
+    }
+
+    /**
+     * Copies bytes from the current position into {@code dst}.
+     *
+     * @return the number of bytes copied, 0 if {@code dst} has no space left,
+     *         or -1 if the position is at or past the end of the content
+     */
+    @Override
+    public int read(ByteBuffer dst) throws ClosedChannelException {
+        check_closed();
+
+        // A full destination is not end-of-stream.
+        if (!dst.hasRemaining()) {
+            return 0;
+        }
+
+        int read = 0;
+
+        while (dst.hasRemaining()) {
+            if (position_current < bytes.length) {
+                int length = Math.min(dst.remaining(), bytes.length - position_current);
+                dst.put
+                    ( bytes
+                    , position_current
+                    , length
+                    );
+                read += length;
+                position_current += length;
+            }
+            else {
+                int position_relative = position_current - bytes.length;
+                if (position_relative >= position_buffer) {
+                    break;
+                }
+                int length = Math.min(dst.remaining(), position_buffer - position_relative);
+                dst.put
+                    ( buffer
+                    , position_relative
+                    , length
+                    );
+                read += length;
+                position_current += length;
+            }
+        }
+
+        return read > 0 ? read : -1;
+    }
+
+    /** Returns the number of bytes currently held. */
+    @Override
+    public long size() throws ClosedChannelException {
+        return size_int();
+    }
+
+    private int size_int() throws ClosedChannelException {
+        check_closed();
+
+        return bytes.length + position_buffer;
+    }
+
+    /**
+     * Shrinks the content to {@code size_target_long} bytes if it is currently
+     * larger, and pulls the position back to that size if it lies beyond it.
+     *
+     * @throws IllegalArgumentException if the size is negative
+     */
+    @Override
+    public ByteArrayChannel truncate(long size_target_long) throws ClosedChannelException, ArithmeticException, IllegalArgumentException {
+        check_closed();
+
+        if (size_target_long < 0) {
+            throw new IllegalArgumentException("size must not be negative");
+        }
+
+        if (size_target_long < size_int()) {
+            // Smaller than an int-sized total, so the narrowing cast is exact.
+            int size_target = (int) size_target_long;
+
+            if (size_target < bytes.length) {
+                byte[] bytes_new = new byte[size_target];
+
+                System.arraycopy(bytes, 0, bytes_new, 0, size_target);
+
+                bytes = bytes_new;
+
+                position_buffer = 0;
+            }
+            else {
+                position_buffer = size_target - bytes.length;
+            }
+        }
+
+        // Applies whether or not anything was cut off: a position that was set
+        // beyond the requested size is moved back to it.
+        if (position_current > size_target_long) {
+            position_current = (int) size_target_long;
+        }
+
+        return this;
+    }
+
+    /**
+     * Copies all remaining bytes of {@code src} to the current position,
+     * overwriting existing content and growing the channel as needed.
+     *
+     * @return the number of bytes written
+     */
+    @Override
+    public int write (ByteBuffer src) throws ClosedChannelException {
+        check_closed();
+
+        if (src.hasRemaining()) {
+            fill_gap();
+        }
+
+        int written = 0;
+
+        while (src.hasRemaining()) {
+            if (position_current < bytes.length) {
+                int length = Math.min(src.remaining(), bytes.length - position_current);
+                src.get
+                    ( bytes
+                    , position_current
+                    , length
+                    );
+                written += length;
+                position_current += length;
+            }
+            else {
+                int position_relative = position_current - bytes.length;
+                int length = Math.min(src.remaining(), buffer.length - position_relative);
+                src.get
+                    ( buffer
+                    , position_relative
+                    , length
+                    );
+                written += length;
+                position_current += length;
+                position_relative += length;
+                position_buffer = Math.max(position_buffer, position_relative);
+                grow();
+            }
+        }
+
+        return written;
+    }
+
+    /** Returns a copy of the whole content. */
+    public byte[] toByteArray() {
+        byte[] bytes_new = new byte[bytes.length + position_buffer];
+
+        System.arraycopy(bytes, 0, bytes_new, 0, bytes.length);
+        System.arraycopy(buffer, 0, bytes_new, bytes.length, position_buffer);
+
+        return bytes_new;
+    }
+
+    // When the position lies beyond the end of the content, extend the settled
+    // prefix with zero bytes up to the position. Without this the tail buffer
+    // would be indexed past its capacity by the following write.
+    private void fill_gap() {
+        if (position_current <= bytes.length + position_buffer) {
+            return;
+        }
+
+        byte[] bytes_new = new byte[position_current];
+
+        System.arraycopy(bytes, 0, bytes_new, 0, bytes.length);
+        System.arraycopy(buffer, 0, bytes_new, bytes.length, position_buffer);
+
+        bytes = bytes_new;
+
+        position_buffer = 0;
+    }
+
+    // Fold the tail buffer into the settled prefix once it is full.
+    private void grow() {
+        if (position_buffer < buffer.length) {
+            return;
+        }
+
+        bytes = toByteArray();
+
+        position_buffer = 0;
+    }
+
+    @Override
+    public boolean isOpen() {
+        return !closed;
+    }
+
+    @Override
+    public void close() {
+        closed = true;
+    }
+
+    private void check_closed() throws ClosedChannelException {
+        if (closed) {
+            throw new ClosedChannelException();
+        }
+    }
+}
+
 // CheckedInputStream:
 //  author:
 //      rinde
 class BoilerpipeArguments {
     // from org.jsoup.helper.DataUtil, as we are re-implementing jsoup's
     // encoding detection. See below for more details.
-    private static final Pattern charset_pattern = Pattern.compile("(?i)\\bcharset=\\s*(?:\"|')?([^\\s,;\"']+)");
+    private static final Pattern content_charset_pattern = Pattern.compile("(?i)\\bcharset=\\s*(?:\"|')?([^\\s,;\"']+)");
+
+    private static final Pattern content_types_pattern = Pattern.compile("(?i)\\b([^\\s,;:\\/]+)\\s*\\/\\s*([^\\s,;:\\/]+)");
 
     private static final Map<byte[], String> map_bom;
     static {
             .required(false)
             .build();
 
+        Option option_url = Option.builder("u")
+            .longOpt("url")
+            .desc("the \"input-path\" argument is a URL")
+            .required(false)
+            .build();
+
         Option option_markup = Option.builder("m")
             .longOpt("markup")
             .desc("extract content with enclosing markup (incompatible with: \"highlight\", \"images\")")
         options.addOption(option_output);
         options.addOption(option_extractor);
         options.addOption(option_wrap);
+        options.addOption(option_url);
 
         options.addOptionGroup(option_group_boilerpipe);
     }
 
     private         Boolean             help            = null;
     private         Boolean             wrap            = null;
+    private         Boolean             url             = null;
 
     private         BufferedWriter      output          = null;
-    private         RandomAccessFile    input           = null;
+    private         SeekableByteChannel input           = null;
     private         ExtractorBase       extractor       = null;
+    private         HttpURLConnection   connection      = null;
     private         Charset             output_charset  = Charset.forName("UTF-8");
     private         Charset             input_charset   = null;
 
         return this.wrap.booleanValue();
     }
 
+    // Whether the "input-path" argument is to be treated as a URL; the flag
+    // is computed by set_url() on first use and cached afterwards.
+    public boolean url() {
+        Boolean cached = this.url;
+        if (cached == null) {
+            set_url();
+            cached = this.url;
+        }
+        return cached.booleanValue();
+    }
+
     public BufferedWriter output() throws IOException {
         if (this.output == null) {
             set_output();
         return this.output;
     }
 
-    public RandomAccessFile input() throws ParseException, IOException {
+    public SeekableByteChannel input() throws ParseException, IOException {
         if (this.input == null) {
             set_input();
         }
     }
 
     public CheckedInputStream input_stream() throws ParseException, IOException {
-        return new CheckedInputStream(new FileInputStream(this.input().getFD()));
+        return new CheckedInputStream(Channels.newInputStream(input()));
     }
 
     // Setters
         }
     }
 
+    // Caches whether option "u" was given on the command line.
+    private void set_url() {
+        boolean given = this.commandline.hasOption("u");
+        this.url = Boolean.valueOf(given);
+    }
+
+
     private void set_output() throws IOException {
         String output_path = "";
         boolean output_path_use = false;
             throw new MissingArgumentException("Missing parameter: input-path");
         }
 
-        this.input = new RandomAccessFile(parameters.get(0), "r");
+        if (url()) {
+            set_input_url(parameters.get(0));
+        } else {
+            set_input_file(parameters.get(0));
+        }
+    }
+
+    // Opens the file at `path` for reading and installs it as the input channel.
+    private void set_input_file(String path) throws IOException {
+        FileChannel channel_file = FileChannel.open(Paths.get(path), StandardOpenOption.READ);
+
+        this.input = channel_file;
+    }
+
+    // Downloads the document at `url` and installs it as the input channel.
+    //
+    // The request asks for an unencoded ("identity") text response. The
+    // download is rejected with an IOException unless the status is 200, the
+    // body is not content-encoded and the declared content type has the major
+    // type "text". The whole body is buffered in memory so that it can be
+    // re-read from position 0. The connection object is kept in
+    // `this.connection` for later header lookups.
+    //
+    // The connection is disconnected on every exit path, including failures.
+    private void set_input_url(String url) throws IOException {
+        java.net.URLConnection connection_raw = new URL(url).openConnection();
+
+        // Other schemes (file:, jar:, ...) yield a different connection type;
+        // report that as an input error instead of a ClassCastException.
+        if (!(connection_raw instanceof HttpURLConnection)) {
+            throw new IOException(String.format("Expected an HTTP(S) URL: %s", url));
+        }
+
+        HttpURLConnection connection = (HttpURLConnection) connection_raw;
+
+        connection.setRequestProperty("User-Agent", "Mozilla/5.0");
+        connection.setRequestProperty("Accept", "text/*");
+        connection.setRequestProperty("Accept-Encoding", "identity");
+
+        try {
+            connection.connect();
+
+            int status = connection.getResponseCode();
+
+            if (status != HttpURLConnection.HTTP_OK) {
+                String status_message = connection.getResponseMessage();
+                String message;
+                if (status_message != null) {
+                    message = String.format("HTTP Error %d: %s", status, status_message);
+                } else {
+                    message = String.format("HTTP Error %d", status);
+                }
+                throw new IOException(message);
+            }
+
+            String content_encoding = connection.getContentEncoding();
+
+            if (content_encoding != null && !content_encoding.trim().equalsIgnoreCase("identity")) {
+                String message = String.format
+                    ( "Expected text, but content is %s encoded"
+                    , content_encoding
+                    );
+                throw new IOException(message);
+            }
+
+            String content_type = connection.getContentType();
+
+            String content_type_type = content_type == null
+                ? null
+                : type_from_content_type(content_type, false);
+
+            if (content_type_type == null || !content_type_type.equalsIgnoreCase("text")) {
+                String message = "Expected text, but content-type is" +
+                    (content_type == null
+                     ? " undeclared"
+                     : ": " + content_type
+                     );
+                throw new IOException(message);
+            }
+
+            byte[] buffer = new byte[8192];
+
+            ByteArrayOutputStream stream_output = new ByteArrayOutputStream(buffer.length);
+
+            // try-with-resources closes the response stream even if a read fails.
+            try (InputStream stream_input = connection.getInputStream()) {
+                int read;
+
+                // read() signals end-of-stream with -1.
+                while ((read = stream_input.read(buffer)) != -1) {
+                    stream_output.write(buffer, 0, read);
+                }
+            }
+
+            this.input = new ByteArrayChannel(stream_output.toByteArray());
+            this.connection = connection;
+        } finally {
+            connection.disconnect();
+        }
     }
 
     private void set_extractor() throws ParseException, IOException {
             encoding = encoding_from_bom(stream1);
         }
 
-        this.input().seek(0);
+        this.input().position(0);
+
+        if (encoding == null && url()) {
+            assert (connection != null);
+            encoding = encoding_from_http(connection);
+        }
+
+        this.input().position(0);
 
         if (encoding == null) {
             encoding = encoding_from_html(stream2);
         }
 
-        this.input().seek(0);
+        this.input().position(0);
 
         if (encoding == null) {
             encoding = encoding_detect(stream3);
         }
 
-        this.input().seek(0);
+        this.input().position(0);
 
         if (encoding == null) {
             encoding = Charset.defaultCharset().name();
         }
 
-        this.input().seek(0);
+        this.input().position(0);
 
         assert !stream1.validated;
         assert !stream2.validated;
         return encoding;
     }
 
+    // Returns the charset named in the connection's Content-Type header, or
+    // null when there is no header, no charset in it, or the charset name is
+    // rejected by encoding_is_legal().
+    static String encoding_from_http(HttpURLConnection connection) {
+        String header = connection.getContentType();
+
+        if (header == null) {
+            return null;
+        }
+
+        String candidate = encoding_from_content_type(header);
+
+        if (candidate == null || !encoding_is_legal(candidate)) {
+            return null;
+        }
+
+        return candidate;
+    }
+
+
     static String encoding_from_content_type(String s) {
-        Matcher charset_matcher = charset_pattern.matcher(s);
+        Matcher charset_matcher = content_charset_pattern.matcher(s);
 
         String charset = null;
 
-        if (charset_matcher.matches()) {
+        if (charset_matcher.find()) {
             charset = charset_matcher.group(1);
             assert !charset.isEmpty();
         }
         return charset;
     }
 
+    // Picks one half of the first "type/subtype" pair found in `s`:
+    // the subtype when `subtype` is true, the major type otherwise.
+    // Returns null when `s` contains no such pair.
+    static String type_from_content_type(String s, boolean subtype) {
+        Matcher pair_matcher = content_types_pattern.matcher(s);
+
+        if (!pair_matcher.find()) {
+            return null;
+        }
+
+        int group_index = subtype ? 2 : 1;
+        String part = pair_matcher.group(group_index);
+        assert !part.isEmpty();
+
+        return part;
+    }
+
+
     static boolean encoding_is_legal(String s) {
         boolean legal = true;
 
             , reader_a
             );
 
-        arguments.input().seek(0);
+        arguments.input().position(0);
 
         String output;
 
 
         stream.invalidate();
 
-        arguments.input().seek(0);
+        arguments.input().position(0);
 
         return output;
     }
 
     static Document process_wrap_shell(BoilerpipeArguments arguments)
         throws ParseException,
-               IOException,
-               BoilerpipeProcessingException {
+               IOException {
         Charset charset = arguments.input_charset();
 
         CheckedInputStream stream = arguments.input_stream();
 
         stream.invalidate();
 
-        arguments.input().seek(0);
+        arguments.input().position(0);
 
         Document document_target = Document.createShell("");
 
Updated by Yclept Nemo

File BoilerpipeHandler.manifest Modified

  • Ignore whitespace
  • Hide word diff
 Manifest-version: 1.0
 Main-Class: BoilerpipeHandler
-Class-Path: /usr/share/java/apache-commons-cli.jar /usr/share/java/boilerpipe.jar /usr/share/java/nekohtml.jar /usr/share/java/xerces-j2.jar /usr/share/java/jsoup/jsoup.jar /usr/share/java/juniversalchardet.jar
+Class-Path: /usr/share/java/apache-commons-cli.jar /usr/share/java/jsoup/jsoup.jar /usr/share/java/juniversalchardet.jar /usr/share/java/xerces-j2.jar nekohtml-relocated-1.9.13.jar boilerpipe-common-2.0-SNAPSHOT.jar

File Makefile Modified

  • Ignore whitespace
  • Hide word diff
 all: BoilerpipeHandler.jar
 
-BoilerpipeHandler.jar: BoilerpipeHandler.manifest BoilerpipeHandler.class BoilerpipeArguments.class CheckedInputStream.class UnrecognizedArgumentException.class
+BoilerpipeHandler.jar: BoilerpipeHandler.manifest BoilerpipeHandler.class BoilerpipeArguments.class CheckedInputStream.class UnrecognizedArgumentException.class ExclusiveOptionsException.class BoilerpipeInterface.class
 	jar -cfm $@ $^
 
 BoilerpipeHandler.class: BoilerpipeHandler.java
-	javac -cp "/usr/share/java/apache-commons-cli.jar:/usr/share/java/boilerpipe.jar:/usr/share/java/jsoup/jsoup.jar:/usr/share/java/juniversalchardet.jar:." BoilerpipeHandler.java
+	javac -cp "/usr/share/java/apache-commons-cli.jar:/usr/share/java/jsoup/jsoup.jar:/usr/share/java/juniversalchardet.jar:boilerpipe-common-2.0-SNAPSHOT.jar" BoilerpipeHandler.java
 
 clean:
 	rm *.jar *.class
Updated by Yclept Nemo

File BoilerpipeHandler.java Modified

  • Ignore whitespace
  • Hide word diff
 import java.nio.charset.UnsupportedCharsetException;
 // External:
 import org.apache.commons.cli.*;
-import de.l3s.boilerpipe.BoilerpipeProcessingException;
-import de.l3s.boilerpipe.extractors.*;
+import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
+import com.kohlschutter.boilerpipe.sax.BoilerpipeSAXInput;
+import com.kohlschutter.boilerpipe.sax.HTMLHighlighter;
+import com.kohlschutter.boilerpipe.sax.ImageExtractor;
+import com.kohlschutter.boilerpipe.document.TextDocument;
+import com.kohlschutter.boilerpipe.document.Image;
+import com.kohlschutter.boilerpipe.extractors.*;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
 import org.jsoup.Jsoup;
-import org.jsoup.nodes.*;
+import org.jsoup.parser.Tag;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Document;
 import org.mozilla.universalchardet.UniversalDetector;
 
 
     }
 }
 
+/**
+ * Signals that two command-line options which cannot be combined were both
+ * given. The first constructor argument is the "former" option and the second
+ * the "latter" one; the message and the accessors use the same assignment.
+ */
+class ExclusiveOptionsException extends ParseException {
+    private final Option option_a;
+    private final Option option_b;
+
+    /** Creates an exception with a free-form message and no options attached. */
+    public ExclusiveOptionsException(String message) {
+        super(message);
+        this.option_a = null;
+        this.option_b = null;
+    }
+
+    /**
+     * @param option_a the former option
+     * @param option_b the latter option, reported as conflicting with the former
+     */
+    public ExclusiveOptionsException(Option option_a, Option option_b) {
+        // The latter option is named first in the message, so option_b goes first.
+        super(String.format
+                ( "Latter option \"%s\" conflicts with former option \"%s\""
+                , display_name(option_b)
+                , display_name(option_a)
+                )
+        );
+        this.option_a = option_a;
+        this.option_b = option_b;
+    }
+
+    /** Returns the former option, or null if none was supplied. */
+    public Option getOptionFormer() {
+        return option_a;
+    }
+
+    /** Returns the latter option, or null if none was supplied. */
+    public Option getOptionLatter() {
+        return option_b;
+    }
+
+    // Long name when the option has one, short name otherwise.
+    private static String display_name(Option option) {
+        return option.getLongOpt() == null
+            ? option.getOpt()
+            : option.getLongOpt();
+    }
+}
+
+/**
+ * Parses one document into a boilerpipe TextDocument, runs an extractor over
+ * it once at construction time, and offers several views of the result.
+ * The html(), highlight() and images() views need the original markup to be
+ * supplied again as their argument.
+ */
+class BoilerpipeInterface {
+    private final TextDocument document;
+    private final ExtractorBase extractor;
+
+    // Constructors: String and Reader inputs are wrapped into an InputSource.
+
+    public BoilerpipeInterface(ExtractorBase extractor, String string) throws BoilerpipeProcessingException {
+        this(extractor, new StringReader(string));
+    }
+
+    public BoilerpipeInterface(ExtractorBase extractor, Reader reader) throws BoilerpipeProcessingException {
+        this(extractor, new InputSource(reader));
+    }
+
+    public BoilerpipeInterface(ExtractorBase extractor, InputSource source) throws BoilerpipeProcessingException {
+        TextDocument parsed;
+        try {
+            parsed = new BoilerpipeSAXInput(source).getTextDocument();
+        }
+        catch (SAXException cause) {
+            // Parser failures are surfaced as boilerpipe failures.
+            throw new BoilerpipeProcessingException(cause);
+        }
+        this.document = parsed;
+        this.extractor = extractor;
+        this.extractor.process(this.document);
+    }
+
+    // text(): content of the processed document
+
+    public String text() {
+        String content = document.getContent();
+        return content;
+    }
+
+    // html(): output of the "extracting" HTMLHighlighter instance
+
+    public String html(String string) throws BoilerpipeProcessingException {
+        Reader reader = new StringReader(string);
+        return html(reader);
+    }
+
+    public String html(Reader reader) throws BoilerpipeProcessingException {
+        InputSource source = new InputSource(reader);
+        return html(source);
+    }
+
+    public String html(InputSource source) throws BoilerpipeProcessingException {
+        HTMLHighlighter extracting = HTMLHighlighter.newExtractingInstance();
+        return extracting.process(document, source);
+    }
+
+    // highlight(): output of the "highlighting" HTMLHighlighter instance
+
+    public String highlight(String string) throws BoilerpipeProcessingException {
+        Reader reader = new StringReader(string);
+        return highlight(reader);
+    }
+
+    public String highlight(Reader reader) throws BoilerpipeProcessingException {
+        InputSource source = new InputSource(reader);
+        return highlight(source);
+    }
+
+    public String highlight(InputSource source) throws BoilerpipeProcessingException {
+        HTMLHighlighter highlighting = HTMLHighlighter.newHighlightingInstance();
+        return highlighting.process(document, source);
+    }
+
+    // images(): result of the shared ImageExtractor instance
+
+    public List<Image> images(String string) throws BoilerpipeProcessingException {
+        Reader reader = new StringReader(string);
+        return images(reader);
+    }
+
+    public List<Image> images(Reader reader) throws BoilerpipeProcessingException {
+        InputSource source = new InputSource(reader);
+        return images(source);
+    }
+
+    public List<Image> images(InputSource source) throws BoilerpipeProcessingException {
+        List<Image> found = ImageExtractor.INSTANCE.process(this.document, source);
+        return found;
+    }
+}
+
 class BoilerpipeArguments {
     // from org.jsoup.helper.DataUtil, as we are re-implementing jsoup's
     // encoding detection. See below for more details.
         map_bom = Collections.unmodifiableMap(map_bom_t);
     }
 
-    private CommandLine         commandline;
-    private Boolean             help            = null;
-    private Boolean             html            = null;
-    private BufferedWriter      output          = null;
-    private RandomAccessFile    input           = null;
-    private ExtractorBase       extractor       = null;
-    private Charset             output_charset  = Charset.forName("UTF-8");
-    private Charset             input_charset   = null;
+    private static final Options options = new Options();
+    static {
+        Option option_help = Option.builder("h")
+            .longOpt("help")
+            .desc("display help")
+            .required(false)
+            .build();
 
-    // Constructor
+        Option option_output = Option.builder("o")
+            .longOpt("output")
+            .desc("output path")
+            .required(false)
+            .hasArg()
+            .argName("output-path")
+            .build();
 
-    public BoilerpipeArguments(CommandLine commandline) {
+        Option option_extractor = Option.builder("e")
+            .longOpt("extractor")
+            .desc("extractor, default 'ArticleExtractor'")
+            .required(false)
+            .hasArg()
+            .argName("extractor")
+            .build();
+
+        Option option_wrap = Option.builder("w")
+            .longOpt("wrap")
+            .desc("wrap output with html derived from the original document (incompatible with: \"highlight\")")
+            .required(false)
+            .build();
+
+        Option option_markup = Option.builder("m")
+            .longOpt("markup")
+            .desc("extract content with enclosing markup (incompatible with: \"highlight\", \"images\")")
+            .required(false)
+            .build();
+
+        Option option_highlight = Option.builder("l")
+            .longOpt("highlight")
+            .desc("highlight content blocks of the original document (incompatible with: \"markup\", \"images\" + \"wrap\")")
+            .required(false)
+            .build();
+
+        Option option_images = Option.builder("i")
+            .longOpt("images")
+            .desc("extract content-enclosed images as links (incompatible with: \"markup\", \"highlight\")")
+            .required(false)
+            .build();
+
+        OptionGroup option_group_boilerpipe = new OptionGroup();
+
+        option_group_boilerpipe.addOption(option_markup);
+        option_group_boilerpipe.addOption(option_highlight);
+        option_group_boilerpipe.addOption(option_images);
+
+        options.addOption(option_help);
+        options.addOption(option_output);
+        options.addOption(option_extractor);
+        options.addOption(option_wrap);
+
+        options.addOptionGroup(option_group_boilerpipe);
+    }
+
+    private final   CommandLine         commandline;
+
+    private         Boolean             markup          = null;
+    private         Boolean             highlight       = null;
+    private         Boolean             images          = null;
+
+    private         Boolean             help            = null;
+    private         Boolean             wrap            = null;
+
+    private         BufferedWriter      output          = null;
+    private         RandomAccessFile    input           = null;
+    private         ExtractorBase       extractor       = null;
+    private         Charset             output_charset  = Charset.forName("UTF-8");
+    private         Charset             input_charset   = null;
+
+    // Constructors
+
+    public BoilerpipeArguments(CommandLine commandline) throws ParseException {
         this.commandline = commandline;
+
+        check();
+    }
+
+    // Parses raw command-line arguments against the shared option table with
+    // a DefaultParser and hands the result to the CommandLine constructor.
+    public BoilerpipeArguments(String[] arguments) throws ParseException {
+        this
+            ( new DefaultParser().parse(options, arguments)
+            );
+    }
+
+    // Getters (static)
+
+    // Exposes the shared option table.
+    public static Options options() {
+        assert options != null;
+
+        return options;
     }
 
     // Getters
 
+    // Whether "markup" output was requested; computed by set_markup() on
+    // first use and cached afterwards.
+    public boolean markup() {
+        Boolean cached = this.markup;
+        if (cached == null) {
+            set_markup();
+            cached = this.markup;
+        }
+        return cached.booleanValue();
+    }
+
+    // Whether "highlight" output was requested; computed by set_highlight()
+    // on first use and cached afterwards.
+    public boolean highlight() {
+        Boolean cached = this.highlight;
+        if (cached == null) {
+            set_highlight();
+            cached = this.highlight;
+        }
+        return cached.booleanValue();
+    }
+
+    // Whether "images" output was requested; computed by set_images() on
+    // first use and cached afterwards.
+    public boolean images() {
+        Boolean cached = this.images;
+        if (cached == null) {
+            set_images();
+            cached = this.images;
+        }
+        return cached.booleanValue();
+    }
+
     public boolean help() {
         if (this.help == null) {
             set_help();
         return this.help.booleanValue();
     }
 
-    public boolean html() {
-        if (this.html == null) {
-            set_html();
+    public boolean wrap() {
+        if (this.wrap == null) {
+            set_wrap();
         }
-        return this.html.booleanValue();
+        return this.wrap.booleanValue();
     }
 
     public BufferedWriter output() throws IOException {
 
     // Setters
 
+    // Caches whether option "m" was given on the command line.
+    private void set_markup() {
+        boolean given = this.commandline.hasOption("m");
+        this.markup = Boolean.valueOf(given);
+    }
+
+    // Caches whether option "l" was given on the command line.
+    private void set_highlight() {
+        boolean given = this.commandline.hasOption("l");
+        this.highlight = Boolean.valueOf(given);
+    }
+
+    // Caches whether option "i" was given on the command line.
+    private void set_images() {
+        boolean given = this.commandline.hasOption("i");
+        this.images = Boolean.valueOf(given);
+    }
+
     private void set_help() {
         if (this.commandline.hasOption("h")) {
             this.help = Boolean.TRUE;
         }
     }
 
-    private void set_html() {
-        if (this.commandline.hasOption("r")) {
-            this.html = Boolean.TRUE;
+    private void set_wrap() {
+        if (this.commandline.hasOption("w")) {
+            this.wrap = Boolean.TRUE;
         } else {
-            this.html = Boolean.FALSE;
+            this.wrap = Boolean.FALSE;
         }
     }
 
             String extractor_string = this.commandline.getOptionValue("e");
 
             if (!is_valid_identifier(extractor_string)) {
-                throw new UnrecognizedArgumentException(String.format(exception_message, "invalid identifier"));
+                throw new UnrecognizedArgumentException(String.format(exception_message, extractor_string, "invalid identifier"));
             }
 
-            String extractor_package = "de.l3s.boilerpipe.extractors.";
+            String extractor_package = "com.kohlschutter.boilerpipe.extractors.";
 
             // Not all derived classes provide getInstance()
             //try {
                 extractor = ExtractorBase.class.cast(extractor_object);
             }
             catch (ClassNotFoundException|NoSuchFieldException|IllegalAccessException e) {
-                throw new UnrecognizedArgumentException(String.format(exception_message, "no such extractor"));
+                throw new UnrecognizedArgumentException(String.format(exception_message, extractor_string, "no such extractor"));
             }
         }
 
         }
     }
 
+    // Rejects command lines that carry both "w" (wrap) and "l" (highlight).
+    public void check() throws ExclusiveOptionsException {
+        if (!commandline.hasOption("w") || !commandline.hasOption("l")) {
+            return;
+        }
+
+        Option option_former = options.getOption("w");
+        Option option_latter = options.getOption("l");
+
+        throw new ExclusiveOptionsException(option_former, option_latter);
+    }
+
+    // Miscellaneous functions (static)
+
+    // Builds the complete help text: the usage line with " input-path"
+    // appended, a one-line description, the positional and optional argument
+    // sections, and finally `error` on a line of its own when it is non-empty.
+    //
+    //   application: program name shown in the usage line
+    //   error:       message to append, or null / "" for none
+    public static String help(String application, String error) {
+        StringWriter sw = new StringWriter();
+        StringBuffer sb = sw.getBuffer();
+        PrintWriter  pw = new PrintWriter(sw, true);
+
+        HelpFormatter formatter = new HelpFormatter();
+
+        formatter.printUsage(pw, 80, application, options);
+
+        // The usage ends with a line terminator, which may be "\r\n" on some
+        // platforms. Strip all of it so that " input-path" continues the
+        // usage line; removing a single character would leave a stray '\r'.
+        int end = sb.length();
+        while (end > 0 && (sb.charAt(end - 1) == '\n' || sb.charAt(end - 1) == '\r')) {
+            end--;
+        }
+        sb.setLength(end);
+
+        sb.append(" input-path");
+        sb.append("\n\n");
+
+        sb.append("Boilerpipe command-line wrapper");
+        sb.append("\n\n");
+
+        sb.append(String.join("\n"
+                    , "positional arguments:"
+                    , "    input-path"));
+        sb.append("\n\n");
+
+        sb.append("optional arguments:\n");
+
+        formatter.printOptions(pw, 80, options, 4, 4);
+
+        if (error != null && error.length() > 0) {
+            sb.append("\n" + error + "\n");
+        }
+
+        return sw.toString();
+    }
+
+
     // Support Methods (static)
 
     static boolean is_valid_identifier(String s) {
         html_metadata = String.join(", ", metadata_tags);
     }
 
+    private static boolean body_markup = false;
+
     static String error_string(String component, String error) {
         return String.join(": ", "Error", component, error);
     }
         System.err.println(error_string(component, error));
     }
 
-    static void help(Options options, String error) {
-        StringWriter sw = new StringWriter();
-        StringBuffer sb = sw.getBuffer();
-        PrintWriter  pw = new PrintWriter(sw, true);
-
-        HelpFormatter formatter = new HelpFormatter();
-
-        formatter.printUsage(pw, 80, "BoilerpipeHandler", options);
-
-        sb.deleteCharAt(sb.length() - 1);
-        sb.append(" input-path");
-        sb.append("\n\n");
-
-        sb.append("Boilerpipe command-line wrapper");
-        sb.append("\n\n");
-
-        sb.append(String.join("\n"
-                    , "positional arguments:"
-                    , "    input-path"));
-        sb.append("\n\n");
-
-        sb.append("optional arguments:\n");
-
-        formatter.printOptions(pw, 80, options, 4, 4);
-
-        if (error != null && error.length() > 0) {
-            sb.append("\n" + error_string("Command-line", error) + "\n");
-        }
-
-        System.err.print(sw.toString());
+    // Prints the text returned by BoilerpipeArguments.help to stderr. A
+    // non-empty error is first formatted as a "Command-line" error; a null
+    // or empty error is passed on as null.
+    static void help(String error) {
+        String message = null;
+
+        if (error != null && !error.isEmpty()) {
+            message = error_string("Command-line", error);
+        }
+
+        String usage = BoilerpipeArguments.help("BoilerpipeHandler", message);
+
+        System.err.print(usage);
     }
 
-    static Options args_build() {
-        Options options = new Options();
-
-        Option option_help = Option.builder("h")
-            .longOpt("help")
-            .desc("display help")
-            .required(false)
-            .build();
-
-        Option option_output = Option.builder("o")
-            .longOpt("output")
-            .desc("output path")
-            .required(false)
-            .hasArg()
-            .argName("output-path")
-            .build();
-
-        Option option_extractor = Option.builder("e")
-            .longOpt("extractor")
-            .desc("extractor, default 'ArticleExtractor'")
-            .required(false)
-            .hasArg()
-            .argName("extractor")
-            .build();
-
-        Option option_html = Option.builder("r")
-            .longOpt("html")
-            .desc("html output suitable for recoll")
-            .required(false)
-            .build();
-
-        options.addOption(option_help);
-        options.addOption(option_output);
-        options.addOption(option_extractor);
-        options.addOption(option_html);
-
-        return options;
+    // Handles arguments that end the run early: when help was requested,
+    // prints the help text (without an error) and exits with status 0.
+    static void process_arguments(BoilerpipeArguments arguments) {
+        boolean help_requested = arguments.help();
+
+        if (!help_requested) {
+            return;
+        }
+
+        help(null);
+        System.exit(0);
     }
 
-    static CommandLine args_parse(Options options, String[] args) throws ParseException {
-        CommandLineParser parser = new DefaultParser();
-
-        CommandLine commandline = parser.parse(options, args);
-
-        return commandline;
-    }
+    static String process_boilerpipe_images(List<Image> images) { // builds one <img> element per image and returns container.html()
+        Element container = new Element(Tag.valueOf("body"), ""); // detached <body> element used only to collect the <img> children
 
-    static void process_help(BoilerpipeArguments arguments, Options options) {
-        if (arguments.help()) {
-            help(options, null);
-            System.exit(0);
+        for (Image image : images) {
+            Element child = new Element(Tag.valueOf("img"), "");
+            if (image.getSrc() != null) { // each attribute is set only when the image supplies a non-null value
+                child.attr("src", image.getSrc());
+            }
+            if (image.getWidth() != null) {
+                child.attr("width", image.getWidth());
+            }
+            if (image.getHeight() != null) {
+                child.attr("height", image.getHeight());
+            }
+            if (image.getAlt() != null) {
+                child.attr("alt", image.getAlt());
+            }
+            container.appendChild(child);
         }
+        
+        return container.html();
     }
 
     static String process_boilerpipe(BoilerpipeArguments arguments)
 
         CheckedInputStream stream = arguments.input_stream();
 
-        BufferedReader reader = new BufferedReader
+        BufferedReader reader_a = new BufferedReader
+            ( new InputStreamReader
+                ( stream
+                , charset
+                )
+            );
+
+        BufferedReader reader_b = new BufferedReader
             ( new InputStreamReader
                 ( stream
                 , charset
                 )
             );
 
-        String text = arguments.extractor().getText(reader);
+        BoilerpipeInterface boilerpipe_interface = new BoilerpipeInterface
+            ( arguments.extractor()
+            , reader_a
+            );
+
+        arguments.input().seek(0);
+
+        String output;
+
+        if (arguments.markup()) {
+            output = boilerpipe_interface.html(reader_b);
+
+            body_markup = true;
+        }
+        else if (arguments.highlight()) {
+            output = boilerpipe_interface.highlight(reader_b);
+
+            body_markup = true;
+        }
+        else if (arguments.images()) {
+            List<Image> images = boilerpipe_interface.images(reader_b);
+            output = process_boilerpipe_images(images);
+
+            body_markup = true;
+        }
+        else {
+            output = boilerpipe_interface.text();
+
+            body_markup = false;
+        }
 
         stream.invalidate();
 
         arguments.input().seek(0);
 
-        return text;
+        return output;
     }
 
-    static String process_html(BoilerpipeArguments arguments)
+    static Document process_wrap_shell(BoilerpipeArguments arguments)
         throws ParseException,
                IOException,
                BoilerpipeProcessingException {
             }
         }
 
-        String text = process_boilerpipe(arguments);
+        return document_target;
+    }
+
+    static String process_wrap(BoilerpipeArguments arguments) // places the process_boilerpipe output in the <body> of the process_wrap_shell document and returns document.toString()
+        throws ParseException,
+               IOException,
+               BoilerpipeProcessingException {
+        Document document = process_wrap_shell(arguments);
+
+        String contents = process_boilerpipe(arguments); // also updates the static body_markup flag consulted below
 
-        Element body = document_target.getElementsByTag("body").first();
+        Element body = document.getElementsByTag("body").first();
 
         assert (body != null);
 
-        body.text(text);
+        if (body_markup) { // markup output is inserted with body.html(); anything else with body.text()
+            body.html(contents);
+        } else {
+            body.text(contents);
+        }
 
-        return document_target.toString();
+        return document.toString();
     }
 
     static String ensure_eol_terminated(String s) {
         return s;
     }
 
-    public static void main(String[] args) {
-        Options options = args_build();
-
+    public static void main(String[] argv) {
         try {
-            CommandLine commandline = args_parse(options, args);
-
-            BoilerpipeArguments arguments = new BoilerpipeArguments(commandline);
+            BoilerpipeArguments arguments = new BoilerpipeArguments(argv);
 
-            process_help(arguments, options);
+            process_arguments(arguments);
 
-            String text;
+            String output = process_boilerpipe(arguments);
 
-            if (arguments.html()) {
-                text = process_html(arguments);
-            } else {
-                text = process_boilerpipe(arguments);
+            if (arguments.wrap()) {
+                output = process_wrap(arguments);
             }
 
-            text = ensure_eol_terminated(text);
+            output = ensure_eol_terminated(output);
 
-            arguments.output().write(text);
+            arguments.output().write(output);
 
             arguments.close();
         }
         catch (ParseException e) {
-            help(options, e.getMessage());
+            help(e.getMessage());
             System.exit(1);
         }
         catch (IOException e) {
Updated by Yclept Nemo

File BoilerpipeHandler.manifest Added

  • Ignore whitespace
  • Hide word diff
+Manifest-version: 1.0
+Main-Class: BoilerpipeHandler
+Class-Path: /usr/share/java/apache-commons-cli.jar /usr/share/java/boilerpipe.jar /usr/share/java/nekohtml.jar /usr/share/java/xerces-j2.jar /usr/share/java/jsoup/jsoup.jar /usr/share/java/juniversalchardet.jar

File BoilerpipeHandler.txt Deleted

  • Ignore whitespace
  • Hide word diff
-Manifest-version: 1.0
-Main-Class: BoilerpipeHandler
-Class-Path: /usr/share/java/apache-commons-cli.jar /usr/share/java/boilerpipe.jar /usr/share/java/nekohtml.jar /usr/share/java/xerces-j2.jar /usr/share/java/jsoup/jsoup.jar /usr/share/java/juniversalchardet.jar
  1. 1
  2. 2
HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.