Html2Text.java
/*
* Copyright (C) 2000 - 2024 Silverpeas
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* As a special exception to the terms and conditions of version 3.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* Open Source Software ("FLOSS") applications as described in Silverpeas's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* "https://www.silverpeas.org/legal/floss_exception.html"
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.silverpeas.components.mailinglist.service.util;
import org.silverpeas.components.mailinglist.service.util.neko.NekoHtmlCleaner;
import org.silverpeas.core.annotation.Bean;
import org.silverpeas.core.annotation.Technical;
import javax.enterprise.inject.Alternative;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;
/**
 * HTML parser, used to extract a plain-text summary from the HTML.
 * <p>
 * The parsing state (extracted text and tag-tracking flags) is kept in instance fields and is
 * reset at each call of {@link #parse(Reader)}; an instance must therefore not be used by
 * several parsings at the same time.
 * </p>
 * @author Emmanuel Hugonnet
 * @deprecated use {@link NekoHtmlCleaner} instead
 */
@Alternative
@Technical
@Bean
public class Html2Text extends HTMLEditorKit.ParserCallback implements HtmlCleaner {

  /** Initial capacity of the buffer receiving the extracted text. */
  private static final int INITIAL_CAPACITY = 1024;
  /** Maximum size of the summary when none is explicitly set. */
  private static final int DEFAULT_MAX_SIZE = 150;
  /** Prefix of the parser error messages that must not suspend the text registration. */
  private static final String IGNORED_ERROR_PREFIX = "invalid.tagatt";
  /** Textual line break tags ({@code <br>}, {@code <br/>}, {@code <br />}) left in the text. */
  private static final Pattern LINE_BREAK_TAG = Pattern.compile("<[Bb][Rr]\\s*/?>");
  /** Any run of whitespace characters. */
  private static final Pattern WHITESPACES = Pattern.compile("\\s+");

  private StringBuilder texte = new StringBuilder(INITIAL_CAPACITY);
  /** True when the encountered text has to be appended to the buffer. */
  private boolean register = false;
  /** True once the BODY start tag has been encountered in the current parsing. */
  private boolean inBody = false;
  /** Number of currently opened SCRIPT tags. */
  private int inScript = 0;
  /** True when the parser reported an error since the last start tag. */
  private boolean hasError = false;
  /** True while the last opened formatting tag has not been closed. */
  private boolean isFormatTag = false;
  private int maxSize = DEFAULT_MAX_SIZE;

  /**
   * Constructor.
   */
  public Html2Text() {
    // default summary size
  }

  /**
   * Constructor.
   * @param maxSize the maximum size of the extracted text.
   */
  public Html2Text(int maxSize) {
    this.maxSize = maxSize;
  }

  /**
   * Sets the maximum size of the summary.
   * @param maxSize the maximum number of characters returned by {@link #getSummary()}.
   */
  @Override
  public void setSummarySize(int maxSize) {
    this.maxSize = maxSize;
  }

  /**
   * Updates the registration state according to the opened tag: the registration starts with the
   * BODY tag and is suspended within the SCRIPT, META, OPTION and LINK tags.
   * @param t the opened tag.
   * @param a the attributes of the tag (unused).
   * @param pos the position of the tag in the document (unused).
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    // a new start tag clears the error reported for the previous content
    hasError = false;
    if (HTML.Tag.BODY.equals(t)) {
      inBody = true;
    }
    if (!register) {
      register = HTML.Tag.BODY.equals(t);
    }
    if (HTML.Tag.SCRIPT.equals(t)) {
      register = false;
      inScript++;
    }
    if (isIgnoredTag(t)) {
      register = false;
    }
    if (isFormattingTag(t)) {
      isFormatTag = true;
    }
  }

  /**
   * Suspends the text registration when the parser reports an error, except for the errors whose
   * message starts with {@code invalid.tagatt}.
   * @param errorMsg the error message reported by the parser.
   * @param pos the position of the error in the document (unused).
   */
  @Override
  public void handleError(String errorMsg, int pos) {
    hasError = !errorMsg.startsWith(IGNORED_ERROR_PREFIX);
  }

  /**
   * Restores the registration state when one of the SCRIPT, META, OPTION or LINK tags is closed.
   * @param t the closed tag.
   * @param pos the position of the tag in the document (unused).
   */
  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    if (HTML.Tag.SCRIPT.equals(t)) {
      // the registration is restored only once the BODY start tag has been seen, so that closing
      // such a tag before the body does not enable the registration too early
      register = inBody;
      inScript = Math.max(inScript - 1, 0);
    }
    if (isIgnoredTag(t)) {
      register = inBody;
    }
    if (isFormattingTag(t)) {
      isFormatTag = false;
    }
  }

  /**
   * Parses the HTML content provided by the specified reader. Any state left by a previous
   * parsing is discarded first.
   * @param in the reader on the HTML content.
   * @throws IOException if the content cannot be read.
   */
  @Override
  public void parse(Reader in) throws IOException {
    resetState();
    ParserDelegator delegator = new ParserDelegator();
    delegator.parse(in, this, true);
  }

  /**
   * Appends the text to the buffer when the registration is active, no script is opened, no
   * error is pending and the buffer does not already exceed the maximum size. Space characters
   * are normalized to a plain space and, outside a formatting tag, a separating space is added
   * after the text.
   * @param text the text encountered by the parser.
   * @param pos the position of the text in the document (unused).
   */
  @Override
  public void handleText(char[] text, int pos) {
    if (text == null || text.length == 0) {
      // nothing to register (and no last character to look at)
      return;
    }
    if (!register || inScript > 0 || hasError || texte.length() > maxSize) {
      return;
    }
    for (final char current : text) {
      texte.append(Character.isSpaceChar(current) ? ' ' : current);
    }
    if (!isFormatTag && !Character.isSpaceChar(text[text.length - 1])) {
      texte.append(' ');
    }
  }

  /**
   * Replaces the end of line by a space as long as the buffer does not exceed the maximum size.
   * @param eol the end of line string (unused).
   */
  @Override
  public void handleEndOfLineString(String eol) {
    if (texte.length() <= maxSize) {
      texte.append(' ');
    }
  }

  @Override
  public void handleComment(char[] data, int pos) {
    // comments are not part of the summary
  }

  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    // simple tags carry no text
  }

  /**
   * Gets the summary of the last parsed content: the textual line breaks are replaced by spaces,
   * the runs of whitespaces are collapsed into a single space, the result is trimmed and then
   * truncated to the maximum size.
   * @return the summary, at most {@code maxSize} characters long.
   */
  @Override
  public String getSummary() {
    String summary = LINE_BREAK_TAG.matcher(texte).replaceAll(" ");
    // trimming is done last so that a leading or trailing line break leaves no stray space
    summary = WHITESPACES.matcher(summary).replaceAll(" ").trim();
    // a negative maximum size is handled as an empty summary instead of failing in substring
    final int limit = Math.max(maxSize, 0);
    if (summary.length() <= limit) {
      return summary;
    }
    return summary.substring(0, limit);
  }

  /** Discards the extracted text and the tag-tracking flags of a previous parsing. */
  private void resetState() {
    texte = new StringBuilder(INITIAL_CAPACITY);
    register = false;
    inBody = false;
    inScript = 0;
    hasError = false;
    isFormatTag = false;
  }

  /** Tells whether the text registration is suspended within the specified tag. */
  private static boolean isIgnoredTag(HTML.Tag t) {
    return HTML.Tag.META.equals(t) || HTML.Tag.OPTION.equals(t) || HTML.Tag.LINK.equals(t);
  }

  /** Tells whether the specified tag only formats the text it encloses. */
  private static boolean isFormattingTag(HTML.Tag t) {
    return HTML.Tag.I.equals(t) || HTML.Tag.B.equals(t) || HTML.Tag.BIG.equals(t) ||
        HTML.Tag.CENTER.equals(t) || HTML.Tag.FONT.equals(t) || HTML.Tag.SMALL.equals(t) ||
        HTML.Tag.SPAN.equals(t) || HTML.Tag.U.equals(t) || HTML.Tag.H1.equals(t) ||
        HTML.Tag.H2.equals(t) || HTML.Tag.H3.equals(t) || HTML.Tag.H4.equals(t) ||
        HTML.Tag.H5.equals(t) || HTML.Tag.H6.equals(t);
  }
}