| /******************************************************************************* | 
 |  * You may amend and distribute as you like, but don't remove this header! | 
 |  * | 
 |  * EPPlus provides server-side generation of Excel 2007/2010 spreadsheets. | 
 |  * See http://www.codeplex.com/EPPlus for details. | 
 |  * | 
 |  * Copyright (C) 2011  Jan Källman | 
 |  * | 
 |  * This library is free software; you can redistribute it and/or | 
 |  * modify it under the terms of the GNU Lesser General Public | 
 |  * License as published by the Free Software Foundation; either | 
 |  * version 2.1 of the License, or (at your option) any later version. | 
 |  | 
 |  * This library is distributed in the hope that it will be useful, | 
 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.   | 
 |  * See the GNU Lesser General Public License for more details. | 
 |  * | 
 |  * The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php | 
 |  * If you unfamiliar with this license or have questions about it, here is an http://www.gnu.org/licenses/gpl-faq.html | 
 |  * | 
 |  * All code and executables are provided "as is" with no warranty either express or implied.  | 
 |  * The author accepts no liability for any damage or loss of business that this product may cause. | 
 |  * | 
 |  * Code change notes: | 
 |  *  | 
 |  * Author				Change						Date | 
 |  * ****************************************************************************** | 
 |  * Richard Tallent		Initial Release				2012-08-13 | 
 |  *******************************************************************************/ | 
 | using System; | 
 | using System.Collections.Generic; | 
 | using System.Text; | 
 | using System.Text.RegularExpressions; | 
 |  | 
 | namespace OfficeOpenXml.Style | 
 | { | 
 | 	public class ExcelRichTextHtmlUtility | 
 | 	{ | 
 |  | 
 | 		/// <summary> | 
 | 		/// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks. | 
 | 		/// HTML support is limited, and does not include font colors, sizes, or typefaces at this time, | 
 | 		/// and also does not support CSS style attributes. It does support line breaks using the BR tag. | 
 | 		/// | 
 | 		/// This routine parses the HTML into RegEx pairings of an HTML tag and the text until the NEXT  | 
 | 		/// tag (if any). The tag is parsed to determine the setting change to be applied to the last set | 
 | 		/// of settings, and if the text is not blank, a new block is added to rich text. | 
 | 		/// </summary> | 
 | 		/// <param name="range"></param> | 
 | 		/// <param name="html">The HTML to parse into RichText</param> | 
 | 		/// <param name="defaultFontName"></param> | 
 | 		/// <param name="defaultFontSize"></param> | 
 |  | 
 | 		public static void SetRichTextFromHtml(ExcelRange range, string html, string defaultFontName, short defaultFontSize) | 
 | 		{ | 
 | 			// Reset the cell value, just in case there is an existing RichText value. | 
 | 			range.Value = ""; | 
 |  | 
 | 			// Sanity check for blank values, skips creating Regex objects for performance. | 
 | 			if (String.IsNullOrEmpty(html)) | 
 | 			{ | 
 | 				range.IsRichText = false; | 
 | 				return; | 
 | 			} | 
 |  | 
 | 			// Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/ | 
 | 			// Cells with line breaks aren't necessarily considered rich text, so this is performed | 
 | 			// before parsing the HTML tags. | 
 | 			html = System.Text.RegularExpressions.Regex.Replace(html, @"<br[^>]*>", "\r\n", RegexOptions.Compiled | RegexOptions.IgnoreCase); | 
 |  | 
 | 			string tag; | 
 | 			string text; | 
 | 			ExcelRichText thisrt = null; | 
 | 			bool isFirst = true; | 
 |  | 
 | 			// Get all pairs of legitimate tags and the text between them. This loop will | 
 | 			// only execute if there is at least one start or end tag. | 
 | 			foreach (Match m in System.Text.RegularExpressions.Regex.Matches(html, @"<(/?[a-z]+)[^<>]*>([\s\S]*?)(?=</?[a-z]+[^<>]*>|$)", RegexOptions.Compiled | RegexOptions.IgnoreCase)) | 
 | 			{ | 
 | 				if (isFirst) | 
 | 				{ | 
 | 					// On the very first match, set up the initial rich text object with | 
 | 					// the defaults for the text BEFORE the match. | 
 | 					range.IsRichText = true; | 
 | 					thisrt = range.RichText.Add(CleanText(html.Substring(0, m.Index)));	// May be 0-length | 
 | 					thisrt.Size = defaultFontSize;										// Set the default font size | 
 | 					thisrt.FontName = defaultFontName;									// Set the default font name | 
 | 					isFirst = false; | 
 | 				} | 
 | 				// Get the tag and the block of text until the NEXT tag or EOS. If there are HTML entities | 
 | 				// encoded, unencode them, they should be passed to RichText as normal characters (other | 
 | 				// than non-breaking spaces, which should be replaced with normal spaces, they break Excel. | 
 | 				tag = m.Groups[1].Captures[0].Value; | 
 | 				text = CleanText(m.Groups[2].Captures[0].Value); | 
 |  | 
 | 				if (thisrt.Text == "") | 
 | 				{ | 
 | 					// The most recent rich text block wasn't *actually* used last time around, so update | 
 | 					// the text and keep it as the "current" block. This happens with the first block if | 
 | 					// it starts with a tag, and may happen later if tags come one right after the other. | 
 | 					thisrt.Text = text; | 
 | 				} | 
 | 				else | 
 | 				{ | 
 | 					// The current rich text block has some text, so create a new one. RichText.Add() | 
 | 					// automatically applies the settings from the previous block, other than vertical | 
 | 					// alignment. | 
 | 					thisrt = range.RichText.Add(text); | 
 | 				} | 
 | 				// Override the settings based on the current tag, keep all other settings. | 
 | 				SetStyleFromTag(tag, thisrt); | 
 | 			} | 
 |  | 
 | 			if (thisrt == null) | 
 | 			{ | 
 | 				// No HTML tags were found, so treat this as a normal text value. | 
 | 				range.IsRichText = false; | 
 | 				range.Value = CleanText(html); | 
 | 			} | 
 | 			else if (String.IsNullOrEmpty(thisrt.Text)) | 
 | 			{ | 
 | 				// Rich text was found, but the last node contains no text, so remove it. This can happen if, | 
 | 				// say, the end of the string is an end tag or unsupported tag (common). | 
 | 				range.RichText.Remove(thisrt); | 
 |  | 
 | 				// Failsafe -- the HTML may be just tags, no text, in which case there may be no rich text | 
 | 				// directives that remain. If that is the case, turn off rich text and treat this like a blank | 
 | 				// cell value. | 
 | 				if (range.RichText.Count == 0) | 
 | 				{ | 
 | 					range.IsRichText = false; | 
 | 					range.Value = ""; | 
 | 				} | 
 |  | 
 | 			} | 
 |  | 
 | 		} | 
 |  | 
 | 		private static void SetStyleFromTag(string tag, ExcelRichText settings) | 
 | 		{ | 
 | 			switch (tag.ToLower()) | 
 | 			{ | 
 | 				case "b": | 
 | 				case "strong": | 
 | 					settings.Bold = true; | 
 | 					break; | 
 | 				case "i": | 
 | 				case "em": | 
 | 					settings.Italic = true; | 
 | 					break; | 
 | 				case "u": | 
 | 					settings.UnderLine = true; | 
 | 					break; | 
 | 				case "s": | 
 | 				case "strike": | 
 | 					settings.Strike = true; | 
 | 					break; | 
 | 				case "sup": | 
 | 					settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript; | 
 | 					break; | 
 | 				case "sub": | 
 | 					settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript; | 
 | 					break; | 
 | 				case "/b": | 
 | 				case "/strong": | 
 | 					settings.Bold = false; | 
 | 					break; | 
 | 				case "/i": | 
 | 				case "/em": | 
 | 					settings.Italic = false; | 
 | 					break; | 
 | 				case "/u": | 
 | 					settings.UnderLine = false; | 
 | 					break; | 
 | 				case "/s": | 
 | 				case "/strike": | 
 | 					settings.Strike = false; | 
 | 					break; | 
 | 				case "/sup": | 
 | 				case "/sub": | 
 | 					settings.VerticalAlign = ExcelVerticalAlignmentFont.None; | 
 | 					break; | 
 | 				default: | 
 | 					// unsupported HTML, no style change | 
 | 					break; | 
 | 			} | 
 | 		} | 
 |  | 
 | 		private static string CleanText(string s) | 
 | 		{ | 
 | 			// Need to convert HTML entities (named or numbered) into actual Unicode characters | 
 | 			s = System.Web.HttpUtility.HtmlDecode(s); | 
 | 			// Remove any non-breaking spaces, kills Excel | 
 | 			s = s.Replace("\u00A0", " "); | 
 | 			return s; | 
 | 		} | 
 |  | 
 | 	} | 
 | } |