import java.nio.charset.UnsupportedCharsetException;
import org.apache.commons.cli.*;
-import de.l3s.boilerpipe.BoilerpipeProcessingException;
-import de.l3s.boilerpipe.extractors.*;
+import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
+import com.kohlschutter.boilerpipe.sax.BoilerpipeSAXInput;
+import com.kohlschutter.boilerpipe.sax.HTMLHighlighter;
+import com.kohlschutter.boilerpipe.sax.ImageExtractor;
+import com.kohlschutter.boilerpipe.document.TextDocument;
+import com.kohlschutter.boilerpipe.document.Image;
+import com.kohlschutter.boilerpipe.extractors.*;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
-import org.jsoup.nodes.*;
+import org.jsoup.parser.Tag;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Document;
import org.mozilla.universalchardet.UniversalDetector;
+class ExclusiveOptionsException extends ParseException {
+ private Option option_a;
+ private Option option_b;
+ public ExclusiveOptionsException(String message) {
+ public ExclusiveOptionsException(Option option_a, Option option_b) {
+ ( "Latter option \"%s\" conflicts with former option \"%s\""
+ , option_a.getLongOpt() == null
+ : option_a.getLongOpt()
+ , option_b.getLongOpt() == null
+ : option_b.getLongOpt()
+ this.option_a = option_a;
+ this.option_b = option_b;
+ public Option getOptionFormer() {
+ public Option getOptionLatter() {
+class BoilerpipeInterface {
+ private final TextDocument document;
+ private final ExtractorBase extractor;
+ public BoilerpipeInterface(ExtractorBase extractor, String string) throws BoilerpipeProcessingException {
+ this(extractor, new StringReader(string));
+ public BoilerpipeInterface(ExtractorBase extractor, Reader reader) throws BoilerpipeProcessingException {
+ this(extractor, new InputSource(reader));
+ public BoilerpipeInterface(ExtractorBase extractor, InputSource source) throws BoilerpipeProcessingException {
+ this.document = new BoilerpipeSAXInput(source).getTextDocument();
+ catch (SAXException e) {
+ throw new BoilerpipeProcessingException(e);
+ this.extractor = extractor;
+ this.extractor.process(this.document);
+ return document.getContent();
+ public String html(String string) throws BoilerpipeProcessingException {
+ return html(new StringReader(string));
+ public String html(Reader reader) throws BoilerpipeProcessingException {
+ return html(new InputSource(reader));
+ public String html(InputSource source) throws BoilerpipeProcessingException {
+ return HTMLHighlighter.newExtractingInstance().process(document, source);
+ public String highlight(String string) throws BoilerpipeProcessingException {
+ return highlight(new StringReader(string));
+ public String highlight(Reader reader) throws BoilerpipeProcessingException {
+ return highlight(new InputSource(reader));
+ public String highlight(InputSource source) throws BoilerpipeProcessingException {
+ return HTMLHighlighter.newHighlightingInstance().process(document, source);
+ public List<Image> images(String string) throws BoilerpipeProcessingException {
+ return images(new StringReader(string));
+ public List<Image> images(Reader reader) throws BoilerpipeProcessingException {
+ return images(new InputSource(reader));
+ public List<Image> images(InputSource source) throws BoilerpipeProcessingException {
+ return ImageExtractor.INSTANCE.process(this.document, source);
class BoilerpipeArguments {
// from org.jsoup.helper.DataUtil, as we are re-implementing jsoup's
// encoding detection. See below for more details.
map_bom = Collections.unmodifiableMap(map_bom_t);
- private CommandLine commandline;
- private Boolean help = null;
- private Boolean html = null;
- private BufferedWriter output = null;
- private RandomAccessFile input = null;
- private ExtractorBase extractor = null;
- private Charset output_charset = Charset.forName("UTF-8");
- private Charset input_charset = null;
+ private static final Options options = new Options();
+ Option option_help = Option.builder("h")
+ Option option_output = Option.builder("o")
+ .argName("output-path")
- public BoilerpipeArguments(CommandLine commandline) {
+ Option option_extractor = Option.builder("e")
+ .desc("extractor, default 'ArticleExtractor'")
+ Option option_wrap = Option.builder("w")
+ .desc("wrap output with html derived from the original document (incompatible with: \"highlight\")")
+ Option option_markup = Option.builder("m")
+ .desc("extract content with enclosing markup (incompatible with: \"highlight\", \"images\")")
+ Option option_highlight = Option.builder("l")
+ .desc("highlight content blocks of the original document (incompatible with: \"markup\", \"images\" + \"wrap\")")
+ Option option_images = Option.builder("i")
+ .desc("extract content-enclosed images as links (incompatible with: \"markup\", \"highlight\")")
+ OptionGroup option_group_boilerpipe = new OptionGroup();
+ option_group_boilerpipe.addOption(option_markup);
+ option_group_boilerpipe.addOption(option_highlight);
+ option_group_boilerpipe.addOption(option_images);
+ options.addOption(option_help);
+ options.addOption(option_output);
+ options.addOption(option_extractor);
+ options.addOption(option_wrap);
+ options.addOptionGroup(option_group_boilerpipe);
+ private final CommandLine commandline;
+ private Boolean markup = null;
+ private Boolean highlight = null;
+ private Boolean images = null;
+ private Boolean help = null;
+ private Boolean wrap = null;
+ private BufferedWriter output = null;
+ private RandomAccessFile input = null;
+ private ExtractorBase extractor = null;
+ private Charset output_charset = Charset.forName("UTF-8");
+ private Charset input_charset = null;
+ public BoilerpipeArguments(CommandLine commandline) throws ParseException {
this.commandline = commandline;
+ public BoilerpipeArguments(String[] arguments) throws ParseException {
+ this(new DefaultParser().parse(options, arguments));
+ public static Options options() {
+ assert (options!= null);
+ public boolean markup() {
+ if (this.markup == null) {
+ return this.markup.booleanValue();
+ public boolean highlight() {
+ if (this.highlight == null) {
+ return this.highlight.booleanValue();
+ public boolean images() {
+ if (this.images == null) {
+ return this.images.booleanValue();
return this.help.booleanValue();
- public boolean html() {
- if (this.html == null) {
+ public boolean wrap() {
+ if (this.wrap == null) {
- return this.html.booleanValue();
+ return this.wrap.booleanValue();
public BufferedWriter output() throws IOException {
+ private void set_markup() {
+ if (this.commandline.hasOption("m")) {
+ this.markup = Boolean.TRUE;
+ this.markup = Boolean.FALSE;
+ private void set_highlight() {
+ if (this.commandline.hasOption("l")) {
+ this.highlight = Boolean.TRUE;
+ this.highlight = Boolean.FALSE;
+ private void set_images() {
+ if (this.commandline.hasOption("i")) {
+ this.images = Boolean.TRUE;
+ this.images = Boolean.FALSE;
private void set_help() {
if (this.commandline.hasOption("h")) {
this.help = Boolean.TRUE;
- private void set_html() {
- if (this.commandline.hasOption("r")) {
- this.html = Boolean.TRUE;
+ private void set_wrap() {
+ if (this.commandline.hasOption("w")) {
+ this.wrap = Boolean.TRUE;
- this.html = Boolean.FALSE;
+ this.wrap = Boolean.FALSE;
String extractor_string = this.commandline.getOptionValue("e");
if (!is_valid_identifier(extractor_string)) {
- throw new UnrecognizedArgumentException(String.format(exception_message, "invalid identifier"));
+ throw new UnrecognizedArgumentException(String.format(exception_message, extractor_string, "invalid identifier"));
- String extractor_package = "de.l3s.boilerpipe.extractors.";
+ String extractor_package = "com.kohlschutter.boilerpipe.extractors.";
// Not all derived classes provide getInstance()
extractor = ExtractorBase.class.cast(extractor_object);
catch (ClassNotFoundException|NoSuchFieldException|IllegalAccessException e) {
- throw new UnrecognizedArgumentException(String.format(exception_message, "no such extractor"));
+ throw new UnrecognizedArgumentException(String.format(exception_message, extractor_string, "no such extractor"));
+ public void check() throws ExclusiveOptionsException {
+ if (commandline.hasOption("w") && commandline.hasOption("l")) {
+ throw new ExclusiveOptionsException(options.getOption("w"), options.getOption("l"));
+ // Miscellaneous functions (static)
+ public static String help(String application, String error) {
+ StringWriter sw = new StringWriter();
+ StringBuffer sb = sw.getBuffer();
+ PrintWriter pw = new PrintWriter(sw, true);
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printUsage(pw, 80, application, options);
+ sb.deleteCharAt(sb.length() - 1);
+ sb.append(" input-path");
+ sb.append("Boilerpipe command-line wrapper");
+ sb.append(String.join("\n"
+ , "positional arguments:"
+ sb.append("optional arguments:\n");
+ formatter.printOptions(pw, 80, options, 4, 4);
+ if (error != null && error.length() > 0) {
+ sb.append("\n" + error + "\n");
// Support Methods (static)
static boolean is_valid_identifier(String s) {
html_metadata = String.join(", ", metadata_tags);
+ private static boolean body_markup = false;
static String error_string(String component, String error) {
return String.join(": ", "Error", component, error);
System.err.println(error_string(component, error));
- static void help(Options options, String error) {
- StringWriter sw = new StringWriter();
- StringBuffer sb = sw.getBuffer();
- PrintWriter pw = new PrintWriter(sw, true);
- HelpFormatter formatter = new HelpFormatter();
- formatter.printUsage(pw, 80, "BoilerpipeHandler", options);
- sb.deleteCharAt(sb.length() - 1);
- sb.append(" input-path");
- sb.append("Boilerpipe command-line wrapper");
- sb.append(String.join("\n"
- , "positional arguments:"
- sb.append("optional arguments:\n");
- formatter.printOptions(pw, 80, options, 4, 4);
- if (error != null && error.length() > 0) {
- sb.append("\n" + error_string("Command-line", error) + "\n");
- System.err.print(sw.toString());
+ static void help(String error) {
+ ( BoilerpipeArguments.help
+ , error != null && error.length() > 0
+ ? error_string("Command-line", error)
- static Options args_build() {
- Options options = new Options();
- Option option_help = Option.builder("h")
- Option option_output = Option.builder("o")
- .argName("output-path")
- Option option_extractor = Option.builder("e")
- .desc("extractor, default 'ArticleExtractor'")
- Option option_html = Option.builder("r")
- .desc("html output suitable for recoll")
- options.addOption(option_help);
- options.addOption(option_output);
- options.addOption(option_extractor);
- options.addOption(option_html);
+ static void process_arguments(BoilerpipeArguments arguments) {
+ if (arguments.help()) {
- static CommandLine args_parse(Options options, String[] args) throws ParseException {
- CommandLineParser parser = new DefaultParser();
- CommandLine commandline = parser.parse(options, args);
+ static String process_boilerpipe_images(List<Image> images) {
+ Element container = new Element(Tag.valueOf("body"), "");
- static void process_help(BoilerpipeArguments arguments, Options options) {
- if (arguments.help()) {
+ for (Image image : images) {
+ Element child = new Element(Tag.valueOf("img"), "");
+ if (image.getSrc() != null) {
+ child.attr("src", image.getSrc());
+ if (image.getWidth() != null) {
+ child.attr("width", image.getWidth());
+ if (image.getHeight() != null) {
+ child.attr("height", image.getHeight());
+ if (image.getAlt() != null) {
+ child.attr("alt", image.getAlt());
+ container.appendChild(child);
+ return container.html();
static String process_boilerpipe(BoilerpipeArguments arguments)
CheckedInputStream stream = arguments.input_stream();
- BufferedReader reader = new BufferedReader
+ BufferedReader reader_a = new BufferedReader
+ ( new InputStreamReader
+ BufferedReader reader_b = new BufferedReader
- String text = arguments.extractor().getText(reader);
+ BoilerpipeInterface boilerpipe_interface = new BoilerpipeInterface
+ ( arguments.extractor()
+ arguments.input().seek(0);
+ if (arguments.markup()) {
+ output = boilerpipe_interface.html(reader_b);
+ else if (arguments.highlight()) {
+ output = boilerpipe_interface.highlight(reader_b);
+ else if (arguments.images()) {
+ List<Image> images = boilerpipe_interface.images(reader_b);
+ output = process_boilerpipe_images(images);
+ output = boilerpipe_interface.text();
arguments.input().seek(0);
- static String process_html(BoilerpipeArguments arguments)
+ static Document process_wrap_shell(BoilerpipeArguments arguments)
BoilerpipeProcessingException {
- String text = process_boilerpipe(arguments);
+ return document_target;
+ static String process_wrap(BoilerpipeArguments arguments)
+ BoilerpipeProcessingException {
+ Document document = process_wrap_shell(arguments);
+ String contents = process_boilerpipe(arguments);
- Element body = document_target.getElementsByTag("body").first();
+ Element body = document.getElementsByTag("body").first();
- return document_target.toString();
+ return document.toString();
static String ensure_eol_terminated(String s) {
- public static void main(String[] args) {
- Options options = args_build();
+ public static void main(String[] argv) {
- CommandLine commandline = args_parse(options, args);
- BoilerpipeArguments arguments = new BoilerpipeArguments(commandline);
+ BoilerpipeArguments arguments = new BoilerpipeArguments(argv);
- process_help(arguments, options);
+ process_arguments(arguments);
+ String output = process_boilerpipe(arguments);
- if (arguments.html()) {
- text = process_html(arguments);
- text = process_boilerpipe(arguments);
+ if (arguments.wrap()) {
+ output = process_wrap(arguments);
- text = ensure_eol_terminated(text);
+ output = ensure_eol_terminated(output);
- arguments.output().write(text);
+ arguments.output().write(output);
catch (ParseException e) {
- help(options, e.getMessage());