| /******************************************************************************* |
| * You may amend and distribute as you like, but don't remove this header! |
| * |
| * EPPlus provides server-side generation of Excel 2007/2010 spreadsheets. |
| * See http://www.codeplex.com/EPPlus for details. |
| * |
| * Copyright (C) 2011 Jan Källman |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| * See the GNU Lesser General Public License for more details. |
| * |
| * The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php |
| * If you unfamiliar with this license or have questions about it, here is an http://www.gnu.org/licenses/gpl-faq.html |
| * |
| * All code and executables are provided "as is" with no warranty either express or implied. |
| * The author accepts no liability for any damage or loss of business that this product may cause. |
| * |
| * Code change notes: |
| * |
| * Author Change Date |
| * ****************************************************************************** |
| * Richard Tallent Initial Release 2012-08-13 |
| *******************************************************************************/ |
| using System; |
| using System.Collections.Generic; |
| using System.Text; |
| using System.Text.RegularExpressions; |
| |
| namespace OfficeOpenXml.Style |
| { |
/// <summary>
/// Helper for loading a limited subset of HTML into the rich text of an <see cref="ExcelRange"/>.
/// </summary>
public class ExcelRichTextHtmlUtility
{
    // Matches BR tags, with or without attributes or a self-closing slash.
    // Built once and reused instead of being re-specified on every call.
    private static readonly Regex _lineBreakTagRegex =
        new Regex(@"<br[^>]*>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Group 1: the tag name, including a leading "/" for end tags.
    // Group 2: the text that follows the tag, up to (not including) the next tag or the end of the string.
    private static readonly Regex _tagAndTextRegex =
        new Regex(@"<(/?[a-z]+)[^<>]*>([\s\S]*?)(?=</?[a-z]+[^<>]*>|$)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks.
    /// HTML support is limited, and does not include font colors, sizes, or typefaces at this time,
    /// and also does not support CSS style attributes. It does support line breaks using the BR tag.
    ///
    /// This routine parses the HTML into RegEx pairings of an HTML tag and the text until the NEXT
    /// tag (if any). The tag is parsed to determine the setting change to be applied to the last set
    /// of settings, and if the text is not blank, a new block is added to rich text.
    /// </summary>
    /// <param name="range">The range whose value is reset and then filled from <paramref name="html"/>.</param>
    /// <param name="html">The HTML to parse into RichText. Null or empty leaves the range blank and not rich text.</param>
    /// <param name="defaultFontName">Font name assigned to the first rich text block that is created.</param>
    /// <param name="defaultFontSize">Font size assigned to the first rich text block that is created.</param>
    /// <exception cref="ArgumentNullException"><paramref name="range"/> is null.</exception>
    public static void SetRichTextFromHtml(ExcelRange range, string html, string defaultFontName, short defaultFontSize)
    {
        if (range == null)
        {
            throw new ArgumentNullException("range");
        }

        // Reset the cell value, just in case there is an existing RichText value.
        range.Value = "";

        // Sanity check for blank values; nothing to parse.
        if (String.IsNullOrEmpty(html))
        {
            range.IsRichText = false;
            return;
        }

        // Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/
        // Cells with line breaks aren't necessarily considered rich text, so this is performed
        // before parsing the HTML tags.
        html = _lineBreakTagRegex.Replace(html, "\r\n");

        // The rich text block currently being filled; stays null until the first tag is matched.
        ExcelRichText current = null;

        // Get all pairs of legitimate tags and the text between them. This loop will
        // only execute if there is at least one start or end tag.
        foreach (Match m in _tagAndTextRegex.Matches(html))
        {
            if (current == null)
            {
                // On the very first match, set up the initial rich text object with
                // the defaults for the text BEFORE the match (which may be 0-length).
                range.IsRichText = true;
                current = range.RichText.Add(CleanText(html.Substring(0, m.Index)));
                current.Size = defaultFontSize;
                current.FontName = defaultFontName;
            }

            // Get the tag and the block of text until the NEXT tag or EOS. HTML entities are
            // decoded so they reach RichText as normal characters (except non-breaking spaces,
            // which CleanText replaces with normal spaces).
            string tag = m.Groups[1].Value;
            string text = CleanText(m.Groups[2].Value);

            if (current.Text == "")
            {
                // The most recent rich text block wasn't *actually* used last time around, so update
                // the text and keep it as the "current" block. This happens with the first block if
                // it starts with a tag, and may happen later if tags come one right after the other.
                current.Text = text;
            }
            else
            {
                // The current rich text block has some text, so create a new one. Per the original
                // author's note, RichText.Add() carries over the settings from the previous block,
                // other than vertical alignment.
                current = range.RichText.Add(text);
            }

            // Override the settings based on the current tag, keep all other settings.
            SetStyleFromTag(tag, current);
        }

        if (current == null)
        {
            // No HTML tags were found, so treat this as a normal text value.
            range.IsRichText = false;
            range.Value = CleanText(html);
        }
        else if (String.IsNullOrEmpty(current.Text))
        {
            // Rich text was found, but the last node contains no text, so remove it. This can happen if,
            // say, the end of the string is an end tag or unsupported tag (common).
            range.RichText.Remove(current);

            // Failsafe -- the HTML may be just tags, no text, in which case there may be no rich text
            // directives that remain. If that is the case, turn off rich text and treat this like a blank
            // cell value.
            if (range.RichText.Count == 0)
            {
                range.IsRichText = false;
                range.Value = "";
            }
        }
    }

    /// <summary>
    /// Applies the style change implied by a single HTML tag to a rich text block.
    /// Start tags switch a setting on, the matching end tags switch it off, and
    /// unrecognised tags leave the block untouched.
    /// </summary>
    /// <param name="tag">Tag name without angle brackets; end tags carry a leading "/". Case-insensitive.</param>
    /// <param name="settings">The rich text block whose style properties are updated.</param>
    private static void SetStyleFromTag(string tag, ExcelRichText settings)
    {
        // Tag names are identifiers, not linguistic text, so lower-case them with the invariant
        // culture. A culture-sensitive ToLower() maps "I" to a dotless i under Turkish cultures,
        // which would stop <I>, <STRIKE> or <STRONG> from matching the labels below.
        switch (tag.ToLowerInvariant())
        {
            case "b":
            case "strong":
                settings.Bold = true;
                break;
            case "i":
            case "em":
                settings.Italic = true;
                break;
            case "u":
                settings.UnderLine = true;
                break;
            case "s":
            case "strike":
                settings.Strike = true;
                break;
            case "sup":
                settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript;
                break;
            case "sub":
                settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript;
                break;
            case "/b":
            case "/strong":
                settings.Bold = false;
                break;
            case "/i":
            case "/em":
                settings.Italic = false;
                break;
            case "/u":
                settings.UnderLine = false;
                break;
            case "/s":
            case "/strike":
                settings.Strike = false;
                break;
            case "/sup":
            case "/sub":
                settings.VerticalAlign = ExcelVerticalAlignmentFont.None;
                break;
            default:
                // Unsupported HTML, no style change.
                break;
        }
    }

    /// <summary>
    /// Decodes HTML entities in a text fragment and replaces non-breaking spaces with normal spaces.
    /// </summary>
    /// <param name="s">The raw text fragment taken from the HTML.</param>
    /// <returns>The decoded text with every U+00A0 replaced by a regular space.</returns>
    private static string CleanText(string s)
    {
        // Convert HTML entities (named or numbered) into actual Unicode characters.
        string decoded = System.Web.HttpUtility.HtmlDecode(s);

        // Replace non-breaking spaces (including those just decoded from &nbsp;) with normal
        // spaces; the original author notes that they cause problems in Excel.
        return decoded.Replace('\u00A0', ' ');
    }
}
| } |