EPPlus/Style/ExcelRichTextHtmlUtility.cs - epplus - Git at Google

 /*******************************************************************************
  * You may amend and distribute as you like, but don't remove this header!
  *
  * EPPlus provides server-side generation of Excel 2007/2010 spreadsheets.
  * See http://www.codeplex.com/EPPlus for details.
  *
  * Copyright (C) 2011  Jan Källman
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.

  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  * See the GNU Lesser General Public License for more details.
  *
  * The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php
  * If you unfamiliar with this license or have questions about it, here is an http://www.gnu.org/licenses/gpl-faq.html
  *
  * All code and executables are provided "as is" with no warranty either express or implied.
  * The author accepts no liability for any damage or loss of business that this product may cause.
  *
  * Code change notes:
  *
  * Author				Change						Date
  * ******************************************************************************
  * Richard Tallent		Initial Release				2012-08-13
  *******************************************************************************/

 using System;
 using System.Text.RegularExpressions;
 using System.Web;

 namespace OfficeOpenXml.Style;

 public class ExcelRichTextHtmlUtility {
   /// <summary>
   /// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks.
   /// HTML support is limited, and does not include font colors, sizes, or typefaces at this time,
   /// and also does not support CSS style attributes. It does support line breaks using the BR tag.
   ///
   /// This routine parses the HTML into RegEx pairings of an HTML tag and the text until the NEXT
   /// tag (if any). The tag is parsed to determine the setting change to be applied to the last set
   /// of settings, and if the text is not blank, a new block is added to rich text.
   /// </summary>
   /// <param name="range"></param>
   /// <param name="html">The HTML to parse into RichText</param>
   /// <param name="defaultFontName"></param>
   /// <param name="defaultFontSize"></param>
   public static void SetRichTextFromHtml(
       ExcelRange range,
       string html,
       string defaultFontName,
       short defaultFontSize) {
     // Reset the cell value, just in case there is an existing RichText value.
     range.Value = "";

     // Sanity check for blank values, skips creating Regex objects for performance.
     if (String.IsNullOrEmpty(html)) {
       range.IsRichText = false;
       return;
     }

     // Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/
     // Cells with line breaks aren't necessarily considered rich text, so this is performed
     // before parsing the HTML tags.
     html = Regex.Replace(
         html,
         "<br[^>]*>",
         "\r\n",
         RegexOptions.Compiled | RegexOptions.IgnoreCase);

     string tag;
     string text;
     ExcelRichText thisrt = null;
     bool isFirst = true;

     // Get all pairs of legitimate tags and the text between them. This loop will
     // only execute if there is at least one start or end tag.
     foreach (Match m in Regex.Matches(
         html,
         @"<(/?[a-z]+)[^<>]*>([\s\S]*?)(?=</?[a-z]+[^<>]*>|$)",
         RegexOptions.Compiled | RegexOptions.IgnoreCase)) {
       if (isFirst) {
         // On the very first match, set up the initial rich text object with
         // the defaults for the text BEFORE the match.
         range.IsRichText = true;
         thisrt = range.RichText.Add(CleanText(html.Substring(0, m.Index))); // May be 0-length
         thisrt.Size = defaultFontSize; // Set the default font size
         thisrt.FontName = defaultFontName; // Set the default font name
         isFirst = false;
       }
       // Get the tag and the block of text until the NEXT tag or EOS. If there are HTML entities
       // encoded, unencode them, they should be passed to RichText as normal characters (other
       // than non-breaking spaces, which should be replaced with normal spaces, they break Excel.
       tag = m.Groups[1].Captures[0].Value;
       text = CleanText(m.Groups[2].Captures[0].Value);

       if (thisrt.Text == "") {
         // The most recent rich text block wasn't *actually* used last time around, so update
         // the text and keep it as the "current" block. This happens with the first block if
         // it starts with a tag, and may happen later if tags come one right after the other.
         thisrt.Text = text;
       } else {
         // The current rich text block has some text, so create a new one. RichText.Add()
         // automatically applies the settings from the previous block, other than vertical
         // alignment.
         thisrt = range.RichText.Add(text);
       }
       // Override the settings based on the current tag, keep all other settings.
       SetStyleFromTag(tag, thisrt);
     }

     if (thisrt == null) {
       // No HTML tags were found, so treat this as a normal text value.
       range.IsRichText = false;
       range.Value = CleanText(html);
     } else if (String.IsNullOrEmpty(thisrt.Text)) {
       // Rich text was found, but the last node contains no text, so remove it. This can happen if,
       // say, the end of the string is an end tag or unsupported tag (common).
       range.RichText.Remove(thisrt);

       // Failsafe -- the HTML may be just tags, no text, in which case there may be no rich text
       // directives that remain. If that is the case, turn off rich text and treat this like a blank
       // cell value.
       if (range.RichText.Count == 0) {
         range.IsRichText = false;
         range.Value = "";
       }
     }
   }

   private static void SetStyleFromTag(string tag, ExcelRichText settings) {
     switch (tag.ToLower()) {
       case "b":
       case "strong":
         settings.Bold = true;
         break;
       case "i":
       case "em":
         settings.Italic = true;
         break;
       case "u":
         settings.UnderLine = true;
         break;
       case "s":
       case "strike":
         settings.Strike = true;
         break;
       case "sup":
         settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript;
         break;
       case "sub":
         settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript;
         break;
       case "/b":
       case "/strong":
         settings.Bold = false;
         break;
       case "/i":
       case "/em":
         settings.Italic = false;
         break;
       case "/u":
         settings.UnderLine = false;
         break;
       case "/s":
       case "/strike":
         settings.Strike = false;
         break;
       case "/sup":
       case "/sub":
         settings.VerticalAlign = ExcelVerticalAlignmentFont.None;
         break;
     }
   }

   private static string CleanText(string s) {
     // Need to convert HTML entities (named or numbered) into actual Unicode characters
     s = HttpUtility.HtmlDecode(s);
     // Remove any non-breaking spaces, kills Excel
     s = s.Replace("\u00A0", " ");
     return s;
   }
 }
	/*******************************************************************************
	* You may amend and distribute as you like, but don't remove this header!
	*
	* EPPlus provides server-side generation of Excel 2007/2010 spreadsheets.
	* See http://www.codeplex.com/EPPlus for details.
	*
	* Copyright (C) 2011 Jan Källman
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.

	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	* See the GNU Lesser General Public License for more details.
	*
	* The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php
	* If you unfamiliar with this license or have questions about it, here is an http://www.gnu.org/licenses/gpl-faq.html
	*
	* All code and executables are provided "as is" with no warranty either express or implied.
	* The author accepts no liability for any damage or loss of business that this product may cause.
	*
	* Code change notes:
	*
	* Author Change Date
	* ******************************************************************************
	* Richard Tallent Initial Release 2012-08-13
	*******************************************************************************/

	using System;
	using System.Text.RegularExpressions;
	using System.Web;

	namespace OfficeOpenXml.Style;

	public class ExcelRichTextHtmlUtility {
	/// <summary>
	/// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks.
	/// HTML support is limited, and does not include font colors, sizes, or typefaces at this time,
	/// and also does not support CSS style attributes. It does support line breaks using the BR tag.
	///
	/// This routine parses the HTML into RegEx pairings of an HTML tag and the text until the NEXT
	/// tag (if any). The tag is parsed to determine the setting change to be applied to the last set
	/// of settings, and if the text is not blank, a new block is added to rich text.
	/// </summary>
	/// <param name="range"></param>
	/// <param name="html">The HTML to parse into RichText</param>
	/// <param name="defaultFontName"></param>
	/// <param name="defaultFontSize"></param>
	public static void SetRichTextFromHtml(
	ExcelRange range,
	string html,
	string defaultFontName,
	short defaultFontSize) {
	// Reset the cell value, just in case there is an existing RichText value.
	range.Value = "";

	// Sanity check for blank values, skips creating Regex objects for performance.
	if (String.IsNullOrEmpty(html)) {
	range.IsRichText = false;
	return;
	}

	// Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/
	// Cells with line breaks aren't necessarily considered rich text, so this is performed
	// before parsing the HTML tags.
	html = Regex.Replace(
	html,
	"<br[^>]*>",
	"\r\n",
	RegexOptions.Compiled \| RegexOptions.IgnoreCase);

	string tag;
	string text;
	ExcelRichText thisrt = null;
	bool isFirst = true;

	// Get all pairs of legitimate tags and the text between them. This loop will
	// only execute if there is at least one start or end tag.
	foreach (Match m in Regex.Matches(
	html,
	@"<(/?[a-z]+)[^<>]>([\s\S]?)(?=</?[a-z]+[^<>]*>\|$)",
	RegexOptions.Compiled \| RegexOptions.IgnoreCase)) {
	if (isFirst) {
	// On the very first match, set up the initial rich text object with
	// the defaults for the text BEFORE the match.
	range.IsRichText = true;
	thisrt = range.RichText.Add(CleanText(html.Substring(0, m.Index))); // May be 0-length
	thisrt.Size = defaultFontSize; // Set the default font size
	thisrt.FontName = defaultFontName; // Set the default font name
	isFirst = false;
	}
	// Get the tag and the block of text until the NEXT tag or EOS. If there are HTML entities
	// encoded, unencode them, they should be passed to RichText as normal characters (other
	// than non-breaking spaces, which should be replaced with normal spaces, they break Excel.
	tag = m.Groups[1].Captures[0].Value;
	text = CleanText(m.Groups[2].Captures[0].Value);

	if (thisrt.Text == "") {
	// The most recent rich text block wasn't actually used last time around, so update
	// the text and keep it as the "current" block. This happens with the first block if
	// it starts with a tag, and may happen later if tags come one right after the other.
	thisrt.Text = text;
	} else {
	// The current rich text block has some text, so create a new one. RichText.Add()
	// automatically applies the settings from the previous block, other than vertical
	// alignment.
	thisrt = range.RichText.Add(text);
	}
	// Override the settings based on the current tag, keep all other settings.
	SetStyleFromTag(tag, thisrt);
	}

	if (thisrt == null) {
	// No HTML tags were found, so treat this as a normal text value.
	range.IsRichText = false;
	range.Value = CleanText(html);
	} else if (String.IsNullOrEmpty(thisrt.Text)) {
	// Rich text was found, but the last node contains no text, so remove it. This can happen if,
	// say, the end of the string is an end tag or unsupported tag (common).
	range.RichText.Remove(thisrt);

	// Failsafe -- the HTML may be just tags, no text, in which case there may be no rich text
	// directives that remain. If that is the case, turn off rich text and treat this like a blank
	// cell value.
	if (range.RichText.Count == 0) {
	range.IsRichText = false;
	range.Value = "";
	}
	}
	}

	private static void SetStyleFromTag(string tag, ExcelRichText settings) {
	switch (tag.ToLower()) {
	case "b":
	case "strong":
	settings.Bold = true;
	break;
	case "i":
	case "em":
	settings.Italic = true;
	break;
	case "u":
	settings.UnderLine = true;
	break;
	case "s":
	case "strike":
	settings.Strike = true;
	break;
	case "sup":
	settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript;
	break;
	case "sub":
	settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript;
	break;
	case "/b":
	case "/strong":
	settings.Bold = false;
	break;
	case "/i":
	case "/em":
	settings.Italic = false;
	break;
	case "/u":
	settings.UnderLine = false;
	break;
	case "/s":
	case "/strike":
	settings.Strike = false;
	break;
	case "/sup":
	case "/sub":
	settings.VerticalAlign = ExcelVerticalAlignmentFont.None;
	break;
	}
	}

	private static string CleanText(string s) {
	// Need to convert HTML entities (named or numbered) into actual Unicode characters
	s = HttpUtility.HtmlDecode(s);
	// Remove any non-breaking spaces, kills Excel
	s = s.Replace("\u00A0", " ");
	return s;
	}
	}