// EPPlus/Style/ExcelRichTextHtmlUtility.cs - epplus - Git at Google

 /*******************************************************************************
  * You may amend and distribute as you like, but don't remove this header!
  *
  * EPPlus provides server-side generation of Excel 2007/2010 spreadsheets.
  * See http://www.codeplex.com/EPPlus for details.
  *
  * Copyright (C) 2011  Jan Källman
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.

  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  * See the GNU Lesser General Public License for more details.
  *
  * The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php
  * If you are unfamiliar with this license or have questions about it, here is an FAQ: http://www.gnu.org/licenses/gpl-faq.html
  *
  * All code and executables are provided "as is" with no warranty either express or implied.
  * The author accepts no liability for any damage or loss of business that this product may cause.
  *
  * Code change notes:
  *
  * Author				Change						Date
  * ******************************************************************************
  * Richard Tallent		Initial Release				2012-08-13
  *******************************************************************************/
 using System;
 using System.Collections.Generic;
 using System.Text;
 using System.Text.RegularExpressions;

 namespace OfficeOpenXml.Style
 {
 	public class ExcelRichTextHtmlUtility
 	{

 		/// <summary>
 		/// Matches a BR tag, with or without attributes or a self-closing slash. The word boundary
 		/// after "br" keeps other tags whose names merely start with "br" from being turned into
 		/// line breaks.
 		/// </summary>
 		private static readonly Regex BreakTagRegex = new Regex(@"<br\b[^>]*>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

 		/// <summary>
 		/// Matches one start or end tag followed by the text up to the next tag or the end of the
 		/// string. Group 1 is the tag name (with a leading slash for end tags); group 2 is the text.
 		/// </summary>
 		private static readonly Regex TagAndTextRegex = new Regex(@"<(/?[a-z]+)[^<>]*>([\s\S]*?)(?=</?[a-z]+[^<>]*>|$)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

 		/// <summary>
 		/// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks.
 		/// Only the tags handled by SetStyleFromTag change the formatting; any other tag is parsed but
 		/// leaves the style untouched. Font colors, sizes, typefaces and CSS style attributes are not
 		/// supported. BR tags are converted to line breaks.
 		///
 		/// The HTML is split into pairs of "tag + text up to the NEXT tag (or end of string)". Each tag
 		/// adjusts the settings carried over from the previous block, and each non-empty text run
 		/// becomes a rich text block. If the HTML contains no tags at all, the cell receives the
 		/// decoded text as a plain (non rich text) value.
 		/// </summary>
 		/// <param name="range">The range whose value is replaced. Must not be null.</param>
 		/// <param name="html">The HTML to parse into RichText. Null or empty clears the cell.</param>
 		/// <param name="defaultFontName">Font name assigned to the first rich text block.</param>
 		/// <param name="defaultFontSize">Font size assigned to the first rich text block.</param>
 		/// <exception cref="ArgumentNullException"><paramref name="range"/> is null.</exception>
 		public static void SetRichTextFromHtml(ExcelRange range, string html, string defaultFontName, short defaultFontSize)
 		{
 			if (range == null)
 			{
 				throw new ArgumentNullException("range");
 			}

 			// Reset the cell value, just in case there is an existing RichText value.
 			range.Value = "";

 			// Nothing to parse: leave the cell blank and make sure it is not flagged as rich text.
 			if (String.IsNullOrEmpty(html))
 			{
 				range.IsRichText = false;
 				return;
 			}

 			// Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/
 			// Cells with line breaks aren't necessarily considered rich text, so this is performed
 			// before parsing the HTML tags.
 			html = BreakTagRegex.Replace(html, "\r\n");

 			// The block currently being filled; stays null until the first tag is found.
 			ExcelRichText current = null;

 			// This loop only executes if there is at least one start or end tag.
 			foreach (Match m in TagAndTextRegex.Matches(html))
 			{
 				if (current == null)
 				{
 					// On the very first match, create the initial block from the text BEFORE the
 					// tag (may be 0-length) and give it the default font.
 					range.IsRichText = true;
 					current = range.RichText.Add(CleanText(html.Substring(0, m.Index)));
 					current.Size = defaultFontSize;
 					current.FontName = defaultFontName;
 				}

 				// Both groups take part in every match, so their values are always available.
 				string tag = m.Groups[1].Value;
 				string text = CleanText(m.Groups[2].Value);

 				if (current.Text == "")
 				{
 					// The most recent block was never actually used (leading tag, or tags that
 					// follow each other directly), so fill it instead of adding another one.
 					current.Text = text;
 				}
 				else
 				{
 					// The current block has text, so start a new one. Per the original author's
 					// note, RichText.Add() carries over the previous block's settings other than
 					// vertical alignment.
 					current = range.RichText.Add(text);
 				}

 				// Override the settings based on the current tag, keep all other settings.
 				SetStyleFromTag(tag, current);
 			}

 			if (current == null)
 			{
 				// No HTML tags were found, so treat this as a normal text value.
 				range.IsRichText = false;
 				range.Value = CleanText(html);
 			}
 			else if (String.IsNullOrEmpty(current.Text))
 			{
 				// The last block has no text (typically because the HTML ends with an end tag or an
 				// unsupported tag), so drop it.
 				range.RichText.Remove(current);

 				// Failsafe -- the HTML may have been tags only. With no blocks left, turn rich text
 				// off and treat the cell as blank.
 				if (range.RichText.Count == 0)
 				{
 					range.IsRichText = false;
 					range.Value = "";
 				}
 			}
 		}

 		/// <summary>
 		/// Applies the formatting change implied by a single HTML tag to a rich text block.
 		/// Start tags switch a setting on, the matching end tags switch it off again, and tags
 		/// that are not listed here leave the block unchanged.
 		/// </summary>
 		/// <param name="tag">
 		/// The tag name without angle brackets, in any letter case, with a leading slash for end
 		/// tags (for example "b" or "/STRONG").
 		/// </param>
 		/// <param name="settings">The rich text block whose style properties are updated.</param>
 		private static void SetStyleFromTag(string tag, ExcelRichText settings)
 		{
 			// Tag names are ASCII identifiers, so lower-case them with the invariant culture.
 			// A culture-sensitive ToLower() maps "I" to a dotless i under Turkish/Azeri cultures,
 			// which would make <I>, </I> and <STRIKE> fall through to the default case.
 			switch (tag.ToLowerInvariant())
 			{
 				case "b":
 				case "strong":
 					settings.Bold = true;
 					break;
 				case "i":
 				case "em":
 					settings.Italic = true;
 					break;
 				case "u":
 					settings.UnderLine = true;
 					break;
 				case "s":
 				case "strike":
 					settings.Strike = true;
 					break;
 				case "sup":
 					settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript;
 					break;
 				case "sub":
 					settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript;
 					break;
 				case "/b":
 				case "/strong":
 					settings.Bold = false;
 					break;
 				case "/i":
 				case "/em":
 					settings.Italic = false;
 					break;
 				case "/u":
 					settings.UnderLine = false;
 					break;
 				case "/s":
 				case "/strike":
 					settings.Strike = false;
 					break;
 				case "/sup":
 				case "/sub":
 					// Either end tag returns the text to the normal baseline.
 					settings.VerticalAlign = ExcelVerticalAlignmentFont.None;
 					break;
 				default:
 					// unsupported HTML, no style change
 					break;
 			}
 		}

 		/// <summary>
 		/// Turns a fragment of HTML text into plain characters: HTML entities (named or numbered)
 		/// are decoded, and every non-breaking space (U+00A0) is then replaced with a regular
 		/// space (the original author notes that non-breaking spaces cause trouble in Excel).
 		/// </summary>
 		/// <param name="s">The HTML-encoded text to clean.</param>
 		/// <returns>The decoded text with non-breaking spaces replaced by regular spaces.</returns>
 		private static string CleanText(string s)
 		{
 			string decoded = System.Web.HttpUtility.HtmlDecode(s);
 			return decoded.Replace("\u00A0", " ");
 		}

 	}
 }
	/*******************************************************************************
	* You may amend and distribute as you like, but don't remove this header!
	*
	* EPPlus provides server-side generation of Excel 2007/2010 spreadsheets.
	* See http://www.codeplex.com/EPPlus for details.
	*
	* Copyright (C) 2011 Jan Källman
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.

	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	* See the GNU Lesser General Public License for more details.
	*
	* The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php
	* If you are unfamiliar with this license or have questions about it, here is an FAQ: http://www.gnu.org/licenses/gpl-faq.html
	*
	* All code and executables are provided "as is" with no warranty either express or implied.
	* The author accepts no liability for any damage or loss of business that this product may cause.
	*
	* Code change notes:
	*
	* Author Change Date
	* ******************************************************************************
	* Richard Tallent Initial Release 2012-08-13
	*******************************************************************************/
	using System;
	using System.Collections.Generic;
	using System.Text;
	using System.Text.RegularExpressions;

	namespace OfficeOpenXml.Style
	{
	public class ExcelRichTextHtmlUtility
	{

	/// <summary>
	/// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks.
	/// Only the tags handled by SetStyleFromTag change the formatting; any other tag is parsed but
	/// leaves the style untouched. Font colors, sizes, typefaces and CSS style attributes are not
	/// supported. BR tags are converted to line breaks.
	///
	/// The HTML is split into pairs of "tag + text up to the NEXT tag (or end of string)". Each tag
	/// adjusts the settings carried over from the previous block, and each non-empty text run
	/// becomes a rich text block. If the HTML contains no tags at all, the cell receives the
	/// decoded text as a plain (non rich text) value.
	/// </summary>
	/// <param name="range">The range whose value is replaced. Must not be null.</param>
	/// <param name="html">The HTML to parse into RichText. Null or empty clears the cell.</param>
	/// <param name="defaultFontName">Font name assigned to the first rich text block.</param>
	/// <param name="defaultFontSize">Font size assigned to the first rich text block.</param>
	/// <exception cref="ArgumentNullException"><paramref name="range"/> is null.</exception>
	public static void SetRichTextFromHtml(ExcelRange range, string html, string defaultFontName, short defaultFontSize)
	{
		if (range == null)
		{
			throw new ArgumentNullException("range");
		}

		// Reset the cell value, just in case there is an existing RichText value.
		range.Value = "";

		// Nothing to parse: leave the cell blank and make sure it is not flagged as rich text.
		if (String.IsNullOrEmpty(html))
		{
			range.IsRichText = false;
			return;
		}

		const RegexOptions options = RegexOptions.Compiled | RegexOptions.IgnoreCase;

		// Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/
		// Cells with line breaks aren't necessarily considered rich text, so this is performed
		// before parsing the HTML tags. The word boundary keeps other tags whose names merely
		// start with "br" from being turned into line breaks.
		html = System.Text.RegularExpressions.Regex.Replace(html, @"<br\b[^>]*>", "\r\n", options);

		// The block currently being filled; stays null until the first tag is found.
		ExcelRichText current = null;

		// Group 1 is the tag name (with a leading slash for end tags); group 2 is the text that
		// runs until the next tag or the end of the string. This loop only executes if there is
		// at least one start or end tag.
		foreach (Match m in System.Text.RegularExpressions.Regex.Matches(html, @"<(/?[a-z]+)[^<>]*>([\s\S]*?)(?=</?[a-z]+[^<>]*>|$)", options))
		{
			if (current == null)
			{
				// On the very first match, create the initial block from the text BEFORE the
				// tag (may be 0-length) and give it the default font.
				range.IsRichText = true;
				current = range.RichText.Add(CleanText(html.Substring(0, m.Index)));
				current.Size = defaultFontSize;
				current.FontName = defaultFontName;
			}

			// Both groups take part in every match, so their values are always available.
			string tag = m.Groups[1].Value;
			string text = CleanText(m.Groups[2].Value);

			if (current.Text == "")
			{
				// The most recent block was never actually used (leading tag, or tags that
				// follow each other directly), so fill it instead of adding another one.
				current.Text = text;
			}
			else
			{
				// The current block has text, so start a new one. Per the original author's
				// note, RichText.Add() carries over the previous block's settings other than
				// vertical alignment.
				current = range.RichText.Add(text);
			}

			// Override the settings based on the current tag, keep all other settings.
			SetStyleFromTag(tag, current);
		}

		if (current == null)
		{
			// No HTML tags were found, so treat this as a normal text value.
			range.IsRichText = false;
			range.Value = CleanText(html);
		}
		else if (String.IsNullOrEmpty(current.Text))
		{
			// The last block has no text (typically because the HTML ends with an end tag or an
			// unsupported tag), so drop it.
			range.RichText.Remove(current);

			// Failsafe -- the HTML may have been tags only. With no blocks left, turn rich text
			// off and treat the cell as blank.
			if (range.RichText.Count == 0)
			{
				range.IsRichText = false;
				range.Value = "";
			}
		}
	}

	/// <summary>
	/// Applies the formatting change implied by a single HTML tag to a rich text block.
	/// Start tags switch a setting on, the matching end tags switch it off again, and tags
	/// that are not listed here leave the block unchanged.
	/// </summary>
	/// <param name="tag">
	/// The tag name without angle brackets, in any letter case, with a leading slash for end
	/// tags (for example "b" or "/STRONG").
	/// </param>
	/// <param name="settings">The rich text block whose style properties are updated.</param>
	private static void SetStyleFromTag(string tag, ExcelRichText settings)
	{
		// Tag names are ASCII identifiers, so lower-case them with the invariant culture.
		// A culture-sensitive ToLower() maps "I" to a dotless i under Turkish/Azeri cultures,
		// which would make <I>, </I> and <STRIKE> fall through to the default case.
		switch (tag.ToLowerInvariant())
		{
			case "b":
			case "strong":
				settings.Bold = true;
				break;
			case "i":
			case "em":
				settings.Italic = true;
				break;
			case "u":
				settings.UnderLine = true;
				break;
			case "s":
			case "strike":
				settings.Strike = true;
				break;
			case "sup":
				settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript;
				break;
			case "sub":
				settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript;
				break;
			case "/b":
			case "/strong":
				settings.Bold = false;
				break;
			case "/i":
			case "/em":
				settings.Italic = false;
				break;
			case "/u":
				settings.UnderLine = false;
				break;
			case "/s":
			case "/strike":
				settings.Strike = false;
				break;
			case "/sup":
			case "/sub":
				// Either end tag returns the text to the normal baseline.
				settings.VerticalAlign = ExcelVerticalAlignmentFont.None;
				break;
			default:
				// unsupported HTML, no style change
				break;
		}
	}

	/// <summary>
	/// Turns a fragment of HTML text into plain characters: HTML entities (named or numbered)
	/// are decoded, and every non-breaking space (U+00A0) is then replaced with a regular
	/// space (the original author notes that non-breaking spaces cause trouble in Excel).
	/// </summary>
	/// <param name="s">The HTML-encoded text to clean.</param>
	/// <returns>The decoded text with non-breaking spaces replaced by regular spaces.</returns>
	private static string CleanText(string s)
	{
		string decoded = System.Web.HttpUtility.HtmlDecode(s);
		return decoded.Replace("\u00A0", " ");
	}

	}
	}