NekoHtmlCleaner.java
/*
* Copyright (C) 2000 - 2024 Silverpeas
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* As a special exception to the terms and conditions of version 3.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* Open Source Software ("FLOSS") applications as described in Silverpeas's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* "https://www.silverpeas.org/legal/floss_exception.html"
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.silverpeas.components.mailinglist.service.util.neko;
import org.silverpeas.components.mailinglist.service.util.HtmlCleaner;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParserConfiguration;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLTagBalancer;
import org.cyberneko.html.filters.ElementRemover;
import org.silverpeas.core.annotation.Bean;
import org.silverpeas.core.annotation.Technical;
import javax.enterprise.inject.Default;
import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.util.regex.Pattern;
/**
 * {@link HtmlCleaner} implementation that runs HTML through a NekoHTML parser pipeline and
 * exposes a whitespace-normalised, length-limited summary of what the pipeline wrote.
 *
 * <p>Usage is stateful: call {@link #parse(Reader)} first, then {@link #getSummary()}. Each call
 * to {@code parse} discards the output of the previous one. The summary size defaults to
 * {@code 0}, so {@link #setSummarySize(int)} must be called to obtain a non-empty summary.
 *
 * <p>The instance keeps the last parsed output in a mutable field and nothing in this class
 * synchronises access to it.
 */
@Default
@Technical
@Bean
public class NekoHtmlCleaner implements HtmlCleaner {

  /** A line-break tag in any letter case, optionally self-closed: {@code <br>}, {@code <BR/>}. */
  private static final Pattern LINE_BREAK = Pattern.compile("<[Bb][Rr]\\s*/?>");

  /** One or more consecutive whitespace characters. */
  private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

  private final XMLParserConfiguration parser;
  private final EntityReplaceWriter writer;
  /** Receives the output of the filter chain for the most recently parsed document. */
  private StringWriter content;
  /** Maximum number of characters returned by {@link #getSummary()}; never negative. */
  private int maxSize = 0;

  /**
   * Builds the parser and its filter chain: tag balancer, element remover, then the writer that
   * fills the summary buffer.
   */
  public NekoHtmlCleaner() {
    ElementRemover remover = new NekoElementRemover();
    remover.removeElement("script");
    remover.removeElement("title");
    remover.removeElement("link");
    remover.removeElement("meta");
    remover.removeElement("select");
    // "br" is the only tag explicitly accepted (with no attributes): it has to reach the buffer
    // so that getSummary() can turn each line break into a word separator.
    remover.acceptElement("br", null);

    content = new StringWriter();
    writer = new EntityReplaceWriter(content, "UTF-8");
    XMLDocumentFilter[] filters = {new HTMLTagBalancer(), remover, writer};

    parser = new HTMLConfiguration();
    parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
    parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
  }

  /**
   * Returns the summary of the last parsed document.
   *
   * <p>Every {@code <br>} tag is replaced by a space, each run of whitespace is collapsed into a
   * single space, and leading and trailing whitespace is removed. The result is then cut to at
   * most the configured summary size.
   *
   * @return the normalised text, truncated to the summary size; empty if nothing was parsed or
   *     the summary size is {@code 0}.
   */
  @Override
  public String getSummary() {
    String text = LINE_BREAK.matcher(content.toString()).replaceAll(" ");
    // Trim only after <br> became a space, otherwise a document starting or ending with a line
    // break would produce a summary with a stray leading or trailing blank.
    text = WHITESPACE_RUN.matcher(text).replaceAll(" ").trim();
    if (text.length() <= maxSize) {
      return text;
    }
    return text.substring(0, maxSize);
  }

  /**
   * Parses the given HTML, replacing the output of any previous parse.
   *
   * @param in the reader supplying the HTML to clean.
   * @throws IOException if the underlying parser fails while reading or writing.
   */
  @Override
  public void parse(Reader in) throws IOException {
    // Start from a fresh buffer so that summaries of successive documents do not accumulate.
    content = new StringWriter();
    writer.setWriter(content);
    XMLInputSource source = new XMLInputSource("-//W3C//DTD HTML 4.01", null, null, in, "UTF-8");
    parser.parse(source);
  }

  /**
   * Sets the maximum length, in characters, of the text returned by {@link #getSummary()}.
   *
   * @param size the maximum summary length; a negative value is treated as {@code 0}.
   */
  @Override
  public void setSummarySize(int size) {
    // A negative limit would make getSummary() fail in substring(0, size).
    this.maxSize = Math.max(0, size);
  }
}