Html2Text.java

/*
 * Copyright (C) 2000 - 2024 Silverpeas
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * As a special exception to the terms and conditions of version 3.0 of
 * the GPL, you may redistribute this Program in connection with Free/Libre
 * Open Source Software ("FLOSS") applications as described in Silverpeas's
 * FLOSS exception. You should have received a copy of the text describing
 * the FLOSS exception, and it is also available here:
 * "https://www.silverpeas.org/legal/floss_exception.html"
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
package org.silverpeas.components.mailinglist.service.util;

import org.silverpeas.components.mailinglist.service.util.neko.NekoHtmlCleaner;
import org.silverpeas.core.annotation.Bean;
import org.silverpeas.core.annotation.Technical;

import javax.enterprise.inject.Alternative;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;

/**
 * HTML parser callback used to extract a plain-text summary from the HTML.
 * <p>
 * The text found once the BODY element has been opened is accumulated, except the text located
 * in SCRIPT, META, OPTION and LINK elements, and {@link #getSummary()} returns it normalized and
 * truncated to the configured maximum size.
 * </p>
 * <p>
 * An instance keeps its parsing state in its fields: it can be reused for several successive
 * calls to {@link #parse(Reader)} but it must not be shared between concurrent parsings.
 * </p>
 * @author Emmanuel Hugonnet
 * @deprecated use {@link NekoHtmlCleaner} instead
 */
@Alternative
@Technical
@Bean
public class Html2Text extends HTMLEditorKit.ParserCallback implements HtmlCleaner {

  /** Maximum size of the summary when none is explicitly set. */
  private static final int DEFAULT_SUMMARY_SIZE = 150;
  /** Initial capacity of the buffer receiving the extracted text. */
  private static final int BUFFER_CAPACITY = 1024;
  /** Textual line breaks ({@code <br>}, {@code <BR/>}, {@code <br />}) left in the text. */
  private static final Pattern LINE_BREAK = Pattern.compile("<[Bb][Rr]\\s*/?>");
  /** Any run of whitespace characters. */
  private static final Pattern WHITESPACES = Pattern.compile("\\s+");

  /** Buffer receiving the extracted text. */
  private StringBuilder texte = new StringBuilder(BUFFER_CAPACITY);
  /** Is the text currently being recorded? */
  private boolean register = false;
  /** Has the BODY start tag already been encountered in the current parsing? */
  private boolean bodySeen = false;
  /** Number of SCRIPT elements currently opened. */
  private int inScript = 0;
  /** Set by {@link #handleError(String, int)} and cleared by the next start tag. */
  private boolean hasError = false;
  /** Is the last opened tag, not yet closed, an inline formatting or a heading one? */
  private boolean isFormatTag = false;
  /** Maximum size of the summary. */
  private int maxSize = DEFAULT_SUMMARY_SIZE;

  /**
   * Constructor. The maximum size of the extracted text is set at 150 characters.
   */
  public Html2Text() {
  }

  /**
   * Constructor.
   * @param maxSize the maximum size of the extracted text.
   */
  public Html2Text(int maxSize) {
    this.maxSize = maxSize;
  }

  /**
   * Sets the maximum size of the summary.
   * @param maxSize the maximum size of the extracted text.
   */
  @Override
  public void setSummarySize(int maxSize) {
    this.maxSize = maxSize;
  }

  /**
   * Updates the recording state according to the opened tag: BODY starts the recording whereas
   * SCRIPT, META, OPTION and LINK suspend it. Any previous parsing error is forgotten.
   * @param t the opened tag.
   * @param a the attributes of the tag (unused).
   * @param pos the position of the tag in the stream (unused).
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    hasError = false;
    if (HTML.Tag.BODY.equals(t)) {
      bodySeen = true;
      register = true;
    }
    if (HTML.Tag.SCRIPT.equals(t)) {
      register = false;
      inScript++;
    }
    if (isSkippedContentTag(t)) {
      register = false;
    }
    if (isInlineFormatTag(t)) {
      isFormatTag = true;
    }
  }

  /**
   * Marks the parsing as being in error, unless the error is about an invalid tag attribute
   * (message starting with {@code invalid.tagatt}). While in error, the text is not recorded.
   * @param errorMsg the message of the error.
   * @param pos the position of the error in the stream (unused).
   */
  @Override
  public void handleError(String errorMsg, int pos) {
    hasError = !errorMsg.startsWith("invalid.tagatt");
  }

  /**
   * Updates the recording state according to the closed tag: closing a SCRIPT, META, OPTION or
   * LINK element resumes the recording.
   * @param t the closed tag.
   * @param pos the position of the tag in the stream (unused).
   */
  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    if (HTML.Tag.SCRIPT.equals(t)) {
      // the recording resumes only within the body: closing an element located before the
      // BODY start tag must not start the recording of the text
      register = bodySeen;
      inScript = Math.max(0, inScript - 1);
    }
    if (isSkippedContentTag(t)) {
      register = bodySeen;
    }
    if (isInlineFormatTag(t)) {
      isFormatTag = false;
    }
  }

  /**
   * Parses the specified HTML content. The text extracted by a previous parsing as well as the
   * whole parsing state are discarded beforehand so that the instance can be safely reused.
   * @param in the reader on the HTML content.
   * @throws IOException if an error occurs while reading the content.
   */
  @Override
  public void parse(Reader in) throws IOException {
    reset();
    ParserDelegator delegator = new ParserDelegator();
    delegator.parse(in, this, true);
  }

  /**
   * Records the specified text when the recording is active, no script is opened, no error is
   * pending and the buffer hasn't yet exceeded the maximum size. Each space character is
   * replaced by a simple space and, outside of a formatting tag, a space is appended to the text
   * in order to separate it from the next one.
   * @param text the characters of the text. Nothing is done if it is null or empty.
   * @param pos the position of the text in the stream (unused).
   */
  @Override
  public void handleText(char[] text, int pos) {
    if (text == null || text.length == 0) {
      // nothing to record, and no last character to check below
      return;
    }
    if (!register || inScript > 0 || hasError || texte.length() > maxSize) {
      return;
    }
    for (final char aText : text) {
      texte.append(Character.isSpaceChar(aText) ? ' ' : aText);
    }
    if (!isFormatTag && !Character.isSpaceChar(text[text.length - 1])) {
      texte.append(' ');
    }
  }

  /**
   * Appends a space to the extracted text if the buffer hasn't yet exceeded the maximum size.
   * @param eol the end of line string (unused).
   */
  @Override
  public void handleEndOfLineString(String eol) {
    if (texte.length() <= maxSize) {
      texte.append(' ');
    }
  }

  /**
   * Comments are ignored.
   */
  @Override
  public void handleComment(char[] data, int pos) {
    // comments don't take part in the summary
  }

  /**
   * Simple tags are ignored.
   */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    // simple tags neither carry text nor change the recording state
  }

  /**
   * Gets the summary of the last parsed content: the textual line breaks are replaced by a
   * space, each run of whitespaces is collapsed into a single space, the result is trimmed and
   * finally truncated to the maximum size.
   * @return the summary, never longer than the maximum size (empty if this size is negative).
   */
  @Override
  public String getSummary() {
    String buffer = LINE_BREAK.matcher(texte).replaceAll(" ");
    // trimming comes last so that a leading or trailing line break doesn't leave a space
    buffer = WHITESPACES.matcher(buffer).replaceAll(" ").trim();
    int limit = Math.max(0, maxSize);
    if (buffer.length() <= limit) {
      return buffer;
    }
    if (limit > 0 && Character.isHighSurrogate(buffer.charAt(limit - 1)) &&
        Character.isLowSurrogate(buffer.charAt(limit))) {
      // don't cut a surrogate pair in the middle
      limit--;
    }
    return buffer.substring(0, limit);
  }

  /**
   * Discards the extracted text and sets the parsing state back to its initial values.
   */
  private void reset() {
    texte = new StringBuilder(BUFFER_CAPACITY);
    register = false;
    bodySeen = false;
    inScript = 0;
    hasError = false;
    isFormatTag = false;
  }

  /**
   * Is the specified tag one of those, other than SCRIPT, whose content isn't recorded?
   */
  private static boolean isSkippedContentTag(HTML.Tag t) {
    return HTML.Tag.META.equals(t) || HTML.Tag.OPTION.equals(t) || HTML.Tag.LINK.equals(t);
  }

  /**
   * Is the specified tag an inline formatting or a heading one, after whose text no separating
   * space is appended?
   */
  private static boolean isInlineFormatTag(HTML.Tag t) {
    return HTML.Tag.I.equals(t) || HTML.Tag.B.equals(t) || HTML.Tag.BIG.equals(t) ||
        HTML.Tag.CENTER.equals(t) || HTML.Tag.FONT.equals(t) || HTML.Tag.SMALL.equals(t) ||
        HTML.Tag.SPAN.equals(t) || HTML.Tag.U.equals(t) || HTML.Tag.H1.equals(t) ||
        HTML.Tag.H2.equals(t) || HTML.Tag.H3.equals(t) || HTML.Tag.H4.equals(t) ||
        HTML.Tag.H5.equals(t) || HTML.Tag.H6.equals(t);
  }
}