/*******************************************************************************
Scriba EBook Maker
Copyright (C) 2011 Senato della Repubblica (http://www.senato.it/)
Offices:
Ufficio Stampa e internet [1]
Servizio dell'Informatica [2]
Contributors:
Roberto Battistoni (2, roberto.battistoni@senato.it): software engineer and developer
Carlo Marchetti (2, carlo.marchetti@senato.it): project manager
Marco Tagliavini (1, marco.tagliavini@senato.it): project visionary
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package it.senato.areatesti.ebook.ebookmaker.plugin.defaultplugin.boilerpipe;
import it.senato.areatesti.ebook.ebookmaker.plugin.defaultplugin.boilerpipe.base.AbstractBoilerPipePlugin;
import it.senato.areatesti.ebook.ebookmaker.scf.bean.ContentItem;
import it.senato.areatesti.ebook.ebookmaker.scf.bean.base.IItem;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import de.l3s.boilerpipe.extractors.CommonExtractors;
import de.l3s.boilerpipe.sax.HTMLHighlighter;
/**
 * Boilerpipe plugin variant that selects
 * {@link CommonExtractors#ARTICLE_EXTRACTOR} as the extraction strategy and
 * an extracting {@link HTMLHighlighter}, then hands the actual work over to
 * {@code intElaborateContent}.
 */
public class ArticleExtractorBPPlugin extends AbstractBoilerPipePlugin
{
    /**
     * Configures the article extractor and the highlighter, then delegates
     * the elaboration of the given content item.
     *
     * @param content      the item whose content URL is read via
     *                     {@code getContentUrl()}
     * @param metadataList not used by this method
     * @return the list produced by {@code intElaborateContent}
     * @throws IOException declared by the signature; in particular the
     *                     {@link URL} constructor rejects a malformed
     *                     content URL with a {@code MalformedURLException}
     */
    @Override
    public List elaborateContent(ContentItem content,
            ArrayList metadataList) throws IOException
    {
        // Build the URL before touching any plugin state, so that a
        // malformed address leaves extractor/highlighter unmodified.
        final URL contentUrl = new URL(content.getContentUrl());

        // Strategy objects used by the delegated elaboration step.
        extractor = CommonExtractors.ARTICLE_EXTRACTOR;
        hh = HTMLHighlighter.newExtractingInstance();

        // No text is available at this point: a null String is passed on.
        final String initialText = null;
        return intElaborateContent(content, contentUrl, initialText);
    }
}