blob: 363878f7409d52055a7615326f1635944b12e61c [file] [log] [blame]
/***************************************************************************
Copyright (c) Microsoft Corporation 2012-2015.
This code is licensed using the Microsoft Public License (Ms-PL). The text of the license can be found here:
http://www.microsoft.com/resources/sharedsource/licensingbasics/publiclicense.mspx
Published at http://OpenXmlDeveloper.org
Resource Center and Documentation: http://openxmldeveloper.org/wiki/w/wiki/powertools-for-open-xml.aspx
Developer: Eric White
Blog: http://www.ericwhite.com
Twitter: @EricWhiteDev
Email: eric@ericwhite.com
***************************************************************************/
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Xml;
using System.Xml.Linq;
using OpenXmlPowerTools;
/*******************************************************************************************
* HtmlToWmlConverter expects the HTML to be passed as an XElement, i.e. as XML. While the HTML test files that
* are included in Open-Xml-PowerTools are able to be read as XML, most HTML is not able to be read as XML.
* The best solution is to use the HtmlAgilityPack, which can parse HTML and save as XML. The HtmlAgilityPack
* is licensed under the Ms-PL (same as Open-Xml-PowerTools) so it is convenient to include it in your solution,
* and thereby you can convert HTML to XML that can be processed by the HtmlToWmlConverter.
*
* A convenient way to get the DLL that has been checked out with HtmlToWmlConverter is to clone the repo at
* https://github.com/EricWhiteDev/HtmlAgilityPack
*
* That repo contains only the DLL that has been checked out with HtmlToWmlConverter.
*
* Of course, you can also get the HtmlAgilityPack source and compile it to get the DLL. You can find it at
* http://codeplex.com/HtmlAgilityPack
*
* We don't include the HtmlAgilityPack in Open-Xml-PowerTools, to simplify installation. The XUnit tests in
* this module do not require the HtmlAgilityPack to run.
*******************************************************************************************/
#if USE_HTMLAGILITYPACK
using HtmlAgilityPack;
#endif
namespace OpenXmlPowerTools
{
/// <summary>
/// Reads an HTML file from disk and returns it as an <see cref="XElement"/> tree whose
/// elements carry no XML namespace.
/// </summary>
public class HtmlToWmlReadAsXElement
{
    /// <summary>
    /// Loads the file referenced by <paramref name="sourceHtmlFi"/> and parses it as XML.
    /// </summary>
    /// <remarks>
    /// When compiled with the USE_HTMLAGILITYPACK symbol and the file is not well-formed XML,
    /// the file is loaded with the HtmlAgilityPack, saved back AS XML OVER THE ORIGINAL FILE,
    /// has its entities normalized, is written to disk a second time, and is then parsed.
    /// Without that symbol the <see cref="XmlException"/> raised by the parser is propagated.
    /// </remarks>
    /// <param name="sourceHtmlFi">The HTML (or XHTML) file to read.</param>
    /// <returns>The root element of the document, with every element moved to no namespace.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="sourceHtmlFi"/> is null.</exception>
    /// <exception cref="XmlException">The content could not be parsed as XML.</exception>
    public static XElement ReadAsXElement(FileInfo sourceHtmlFi)
    {
        if (sourceHtmlFi == null)
        {
            throw new ArgumentNullException("sourceHtmlFi");
        }

        string htmlString = File.ReadAllText(sourceHtmlFi.FullName);
        XElement html = null;
        try
        {
            html = XElement.Parse(htmlString);
        }
#if USE_HTMLAGILITYPACK
        catch (XmlException)
        {
            // Not well-formed XML: let the HtmlAgilityPack repair the markup and write it
            // back to the same path as XML.
            HtmlDocument hdoc = new HtmlDocument();
            hdoc.Load(sourceHtmlFi.FullName, Encoding.Default);
            hdoc.OptionOutputAsXml = true;
            hdoc.Save(sourceHtmlFi.FullName, Encoding.Default);
            StringBuilder sb = new StringBuilder(File.ReadAllText(sourceHtmlFi.FullName, Encoding.Default));

            // Step 1: decode the entities that are replaced by their literal characters
            // (&nbsp; in particular is not a predefined XML entity).
            sb.Replace("&amp;", "&");
            sb.Replace("&nbsp;", "\xA0");
            sb.Replace("&quot;", "\"");

            // Step 2: hide the entities and numeric character references that must survive
            // behind '~' placeholders so that step 3 does not touch their ampersands.
            sb.Replace("&lt;", "~lt;");
            sb.Replace("&gt;", "~gt;");
            sb.Replace("&#", "~#");

            // Step 3: every ampersand still present is a bare one; escape it.
            sb.Replace("&", "&amp;");

            // Step 4: restore the protected entities and character references.
            sb.Replace("~lt;", "&lt;");
            sb.Replace("~gt;", "&gt;");
            sb.Replace("~#", "&#");

            File.WriteAllText(sourceHtmlFi.FullName, sb.ToString(), Encoding.Default);
            html = XElement.Parse(sb.ToString());
        }
#else
        catch (XmlException)
        {
            // No fallback parser is compiled in. Rethrow with "throw;" (not "throw e;")
            // so the original stack trace is preserved for the caller.
            throw;
        }
#endif
        html = (XElement)ConvertToNoNamespace(html);
        return html;
    }

    /// <summary>
    /// Recursively rebuilds an element using only the local part of each element name.
    /// </summary>
    /// <remarks>
    /// Namespace declaration attributes (xmlns / xmlns:prefix) are dropped; all other
    /// attributes are passed through unchanged, including any namespace on the attribute
    /// name itself. Nodes that are not elements are returned as-is.
    /// </remarks>
    /// <param name="node">The node to convert.</param>
    /// <returns>
    /// A new <see cref="XElement"/> when <paramref name="node"/> is an element; otherwise
    /// <paramref name="node"/> itself.
    /// </returns>
    private static object ConvertToNoNamespace(XNode node)
    {
        XElement element = node as XElement;
        if (element != null)
        {
            return new XElement(element.Name.LocalName,
                element.Attributes().Where(a => !a.IsNamespaceDeclaration),
                element.Nodes().Select(n => ConvertToNoNamespace(n)));
        }
        return node;
    }
}
}