/******************************************************************************* Scriba EBook Maker Copyright (C) 2011 Senato della Repubblica (http://www.senato.it/) Offices: Ufficio Stampa e internet [1] Servizio dell'Informatica [2] Contributors: Roberto Battistoni (2, roberto.battistoni@senato.it): software engineer and developer Carlo Marchetti (2, carlo.marchetti@senato.it): project manager Marco Tagliavini (1, marco.tagliavini@senato.it): project visionary This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. 
If not, see <http://www.gnu.org/licenses/>.
 *******************************************************************************/
package it.senato.areatesti.ebook.ebookmaker.plugin.defaultplugin.boilerpipe;

import it.senato.areatesti.ebook.ebookmaker.plugin.defaultplugin.boilerpipe.base.AbstractBoilerPipePlugin;
import it.senato.areatesti.ebook.ebookmaker.scf.bean.ContentItem;
import it.senato.areatesti.ebook.ebookmaker.scf.bean.base.IItem;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import de.l3s.boilerpipe.extractors.CommonExtractors;
import de.l3s.boilerpipe.sax.HTMLHighlighter;

/**
 * Boilerpipe plugin variant that selects {@link CommonExtractors#ARTICLE_EXTRACTOR}
 * together with an extracting {@link HTMLHighlighter} and then hands the work
 * over to {@code intElaborateContent}.
 *
 * <p>NOTE(review): {@code extractor}, {@code hh} and {@code intElaborateContent}
 * are not declared in this class; presumably they are inherited from
 * {@link AbstractBoilerPipePlugin} -- confirm against the base class.
 */
public class ArticleExtractorBPPlugin extends AbstractBoilerPipePlugin
{
	/**
	 * Configures the article extractor and delegates the elaboration of the
	 * given content item.
	 *
	 * @param content      the item whose content URL (as returned by
	 *                     {@code getContentUrl()}) is processed
	 * @param metadataList not read by this implementation
	 * @return the list produced by {@code intElaborateContent}
	 * @throws IOException if the content URL is malformed, or if propagated
	 *                     from the delegated call
	 */
	@Override
	public List elaborateContent(ContentItem content, ArrayList metadataList) throws IOException
	{
		// Resolve the location first: a malformed URL aborts the call before
		// the extractor/highlighter fields are touched.
		URL sourceUrl = new URL(content.getContentUrl());

		// No text is supplied up front; null is forwarded as-is.
		// NOTE(review): presumably the delegate obtains the text from the URL
		// when it receives null -- confirm in the base class.
		String initialText = null;

		extractor = CommonExtractors.ARTICLE_EXTRACTOR;
		hh = HTMLHighlighter.newExtractingInstance();

		return intElaborateContent(content, sourceUrl, initialText);
	}
}