blob: 7b8638953e37542939e66a7bd7d9d34bb42dd046 [file] [log] [blame]
/***************************************************************************
Copyright (c) Microsoft Corporation 2012-2015.
This code is licensed using the Microsoft Public License (Ms-PL). The text of the license can be found here:
http://www.microsoft.com/resources/sharedsource/licensingbasics/publiclicense.mspx
Published at http://OpenXmlDeveloper.org
Resource Center and Documentation: http://openxmldeveloper.org/wiki/w/wiki/powertools-for-open-xml.aspx
Developer: Eric White
Blog: http://www.ericwhite.com
Twitter: @EricWhiteDev
Email: eric@ericwhite.com
***************************************************************************/
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Xml;
using System.Xml.Linq;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Validation;
using OpenXmlPowerTools;
using Xunit;
using System.Text.RegularExpressions;
/*******************************************************************************************
* HtmlToWmlConverter expects the HTML to be passed as an XElement, i.e. as XML. While the HTML test files that
* are included in Open-Xml-PowerTools are able to be read as XML, most HTML is not able to be read as XML.
* The best solution is to use the HtmlAgilityPack, which can parse HTML and save as XML. The HtmlAgilityPack
* is licensed under the Ms-PL (same as Open-Xml-PowerTools) so it is convenient to include it in your solution,
* and thereby you can convert HTML to XML that can be processed by the HtmlToWmlConverter.
*
* A convenient way to get the DLL that has been checked out with HtmlToWmlConverter is to clone the repo at
* https://github.com/EricWhiteDev/HtmlAgilityPack
*
* That repo contains only the DLL that has been checked out with HtmlToWmlConverter.
*
* Of course, you can also get the HtmlAgilityPack source and compile it to get the DLL. You can find it at
* http://codeplex.com/HtmlAgilityPack
*
* We don't include the HtmlAgilityPack in Open-Xml-PowerTools, to simplify installation. The XUnit tests in
* this module do not require the HtmlAgilityPack to run.
*******************************************************************************************/
#if DO_CONVERSION_VIA_WORD
using Word = Microsoft.Office.Interop.Word;
#endif
#if !ELIDE_XUNIT_TESTS
namespace OxPt
{
public class HwTests
{
static bool s_ProduceAnnotatedHtml = true;
// PowerShell oneliner that generates InlineData for all files in a directory
// dir | % { '[InlineData("' + $_.Name + '")]' } | clip
[Theory]
[InlineData("T0010.html")]
[InlineData("T0011.html")]
[InlineData("T0012.html")]
[InlineData("T0013.html")]
[InlineData("T0014.html")]
[InlineData("T0015.html")]
[InlineData("T0020.html")]
[InlineData("T0030.html")]
[InlineData("T0040.html")]
[InlineData("T0050.html")]
[InlineData("T0060.html")]
[InlineData("T0070.html")]
[InlineData("T0080.html")]
[InlineData("T0090.html")]
[InlineData("T0100.html")]
[InlineData("T0110.html")]
[InlineData("T0111.html")]
[InlineData("T0112.html")]
[InlineData("T0120.html")]
[InlineData("T0130.html")]
[InlineData("T0140.html")]
[InlineData("T0150.html")]
[InlineData("T0160.html")]
[InlineData("T0170.html")]
[InlineData("T0180.html")]
[InlineData("T0190.html")]
[InlineData("T0200.html")]
[InlineData("T0210.html")]
[InlineData("T0220.html")]
[InlineData("T0230.html")]
[InlineData("T0240.html")]
[InlineData("T0250.html")]
[InlineData("T0251.html")]
[InlineData("T0260.html")]
[InlineData("T0270.html")]
[InlineData("T0280.html")]
[InlineData("T0290.html")]
[InlineData("T0300.html")]
[InlineData("T0310.html")]
[InlineData("T0320.html")]
[InlineData("T0330.html")]
[InlineData("T0340.html")]
[InlineData("T0350.html")]
[InlineData("T0360.html")]
[InlineData("T0370.html")]
[InlineData("T0380.html")]
[InlineData("T0390.html")]
[InlineData("T0400.html")]
[InlineData("T0410.html")]
[InlineData("T0420.html")]
[InlineData("T0430.html")]
[InlineData("T0431.html")]
[InlineData("T0432.html")]
[InlineData("T0440.html")]
[InlineData("T0450.html")]
[InlineData("T0460.html")]
[InlineData("T0470.html")]
[InlineData("T0480.html")]
[InlineData("T0490.html")]
[InlineData("T0500.html")]
[InlineData("T0510.html")]
[InlineData("T0520.html")]
[InlineData("T0530.html")]
[InlineData("T0540.html")]
[InlineData("T0550.html")]
[InlineData("T0560.html")]
[InlineData("T0570.html")]
[InlineData("T0580.html")]
[InlineData("T0590.html")]
[InlineData("T0600.html")]
[InlineData("T0610.html")]
[InlineData("T0620.html")]
[InlineData("T0622.html")]
[InlineData("T0630.html")]
[InlineData("T0640.html")]
[InlineData("T0650.html")]
[InlineData("T0651.html")]
[InlineData("T0660.html")]
[InlineData("T0670.html")]
[InlineData("T0680.html")]
[InlineData("T0690.html")]
[InlineData("T0691.html")]
[InlineData("T0692.html")]
[InlineData("T0700.html")]
[InlineData("T0710.html")]
[InlineData("T0720.html")]
[InlineData("T0730.html")]
[InlineData("T0740.html")]
[InlineData("T0742.html")]
[InlineData("T0745.html")]
[InlineData("T0750.html")]
[InlineData("T0760.html")]
[InlineData("T0770.html")]
[InlineData("T0780.html")]
[InlineData("T0790.html")]
[InlineData("T0791.html")]
[InlineData("T0792.html")]
[InlineData("T0793.html")]
[InlineData("T0794.html")]
[InlineData("T0795.html")]
[InlineData("T0802.html")]
[InlineData("T0804.html")]
[InlineData("T0805.html")]
[InlineData("T0810.html")]
[InlineData("T0812.html")]
[InlineData("T0814.html")]
[InlineData("T0820.html")]
[InlineData("T0821.html")]
[InlineData("T0830.html")]
[InlineData("T0840.html")]
[InlineData("T0850.html")]
[InlineData("T0851.html")]
[InlineData("T0860.html")]
[InlineData("T0870.html")]
[InlineData("T0880.html")]
[InlineData("T0890.html")]
[InlineData("T0900.html")]
[InlineData("T0910.html")]
[InlineData("T0920.html")]
[InlineData("T0921.html")]
[InlineData("T0922.html")]
[InlineData("T0923.html")]
[InlineData("T0924.html")]
[InlineData("T0925.html")]
[InlineData("T0926.html")]
[InlineData("T0927.html")]
[InlineData("T0928.html")]
[InlineData("T0929.html")]
[InlineData("T0930.html")]
[InlineData("T0931.html")]
[InlineData("T0932.html")]
[InlineData("T0933.html")]
[InlineData("T0934.html")]
[InlineData("T0935.html")]
[InlineData("T0936.html")]
[InlineData("T0940.html")]
[InlineData("T0945.html")]
[InlineData("T0948.html")]
[InlineData("T0950.html")]
[InlineData("T0955.html")]
[InlineData("T0960.html")]
[InlineData("T0968.html")]
[InlineData("T0970.html")]
[InlineData("T0980.html")]
[InlineData("T0990.html")]
[InlineData("T1000.html")]
[InlineData("T1010.html")]
[InlineData("T1020.html")]
[InlineData("T1030.html")]
[InlineData("T1040.html")]
[InlineData("T1050.html")]
[InlineData("T1060.html")]
[InlineData("T1070.html")]
[InlineData("T1080.html")]
[InlineData("T1100.html")]
[InlineData("T1110.html")]
[InlineData("T1111.html")]
[InlineData("T1112.html")]
[InlineData("T1120.html")]
[InlineData("T1130.html")]
[InlineData("T1131.html")]
[InlineData("T1132.html")]
[InlineData("T1140.html")]
[InlineData("T1150.html")]
[InlineData("T1160.html")]
[InlineData("T1170.html")]
[InlineData("T1180.html")]
[InlineData("T1190.html")]
[InlineData("T1200.html")]
[InlineData("T1201.html")]
[InlineData("T1210.html")]
[InlineData("T1220.html")]
[InlineData("T1230.html")]
[InlineData("T1240.html")]
[InlineData("T1241.html")]
[InlineData("T1242.html")]
[InlineData("T1250.html")]
[InlineData("T1251.html")]
[InlineData("T1260.html")]
[InlineData("T1270.html")]
[InlineData("T1280.html")]
[InlineData("T1290.html")]
[InlineData("T1300.html")]
[InlineData("T1310.html")]
[InlineData("T1320.html")]
[InlineData("T1330.html")]
[InlineData("T1340.html")]
[InlineData("T1350.html")]
[InlineData("T1360.html")]
[InlineData("T1370.html")]
[InlineData("T1380.html")]
[InlineData("T1390.html")]
[InlineData("T1400.html")]
[InlineData("T1410.html")]
[InlineData("T1420.html")]
[InlineData("T1430.html")]
[InlineData("T1440.html")]
[InlineData("T1450.html")]
[InlineData("T1460.html")]
[InlineData("T1470.html")]
[InlineData("T1480.html")]
[InlineData("T1490.html")]
[InlineData("T1500.html")]
[InlineData("T1510.html")]
[InlineData("T1520.html")]
[InlineData("T1530.html")]
[InlineData("T1540.html")]
[InlineData("T1550.html")]
[InlineData("T1560.html")]
[InlineData("T1570.html")]
[InlineData("T1580.html")]
[InlineData("T1590.html")]
[InlineData("T1591.html")]
[InlineData("T1610.html")]
[InlineData("T1620.html")]
[InlineData("T1630.html")]
[InlineData("T1640.html")]
[InlineData("T1650.html")]
[InlineData("T1660.html")]
[InlineData("T1670.html")]
[InlineData("T1680.html")]
[InlineData("T1690.html")]
[InlineData("T1700.html")]
[InlineData("T1710.html")]
[InlineData("T1800.html")]
[InlineData("T1810.html")]
[InlineData("T1820.html")]
[InlineData("T1830.html")]
[InlineData("T1840.html")]
[InlineData("T1850.html")]
public void HW001(string name)
{
#if false
string[] cssFilter = new[] {
"text-indent",
"margin-left",
"margin-right",
"padding-left",
"padding-right",
};
#else
string[] cssFilter = null;
#endif
#if false
string[] htmlFilter = new[] {
"img",
};
#else
string[] htmlFilter = null;
#endif
var sourceHtmlFi = new FileInfo(Path.Combine(TestUtil.SourceDir.FullName, name));
var sourceImageDi = new DirectoryInfo(Path.Combine(TestUtil.SourceDir.FullName, sourceHtmlFi.Name.Replace(".html", "_files")));
var destImageDi = new DirectoryInfo(Path.Combine(TestUtil.TempDir.FullName, sourceImageDi.Name));
var sourceCopiedToDestHtmlFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, sourceHtmlFi.Name.Replace(".html", "-1-Source.html")));
var destCssFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, sourceHtmlFi.Name.Replace(".html", "-2.css")));
var destDocxFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, sourceHtmlFi.Name.Replace(".html", "-3-ConvertedByHtmlToWml.docx")));
var annotatedHtmlFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, sourceHtmlFi.Name.Replace(".html", "-4-Annotated.txt")));
if (!sourceCopiedToDestHtmlFi.Exists)
File.Copy(sourceHtmlFi.FullName, sourceCopiedToDestHtmlFi.FullName);
XElement html = HtmlToWmlReadAsXElement.ReadAsXElement(sourceCopiedToDestHtmlFi);
string htmlString = html.ToString();
if (htmlFilter != null && htmlFilter.Any())
{
bool found = false;
foreach (var item in htmlFilter)
{
if (htmlString.Contains(item))
{
found = true;
break;
}
}
if (!found)
{
sourceCopiedToDestHtmlFi.Delete();
return;
}
}
string usedAuthorCss = HtmlToWmlConverter.CleanUpCss((string)html.Descendants().FirstOrDefault(d => d.Name.LocalName.ToLower() == "style"));
File.WriteAllText(destCssFi.FullName, usedAuthorCss);
if (cssFilter != null && cssFilter.Any())
{
bool found = false;
foreach (var item in cssFilter)
{
if (usedAuthorCss.Contains(item))
{
found = true;
break;
}
}
if (!found)
{
sourceCopiedToDestHtmlFi.Delete();
destCssFi.Delete();
return;
}
}
if (sourceImageDi.Exists)
{
destImageDi.Create();
foreach (var file in sourceImageDi.GetFiles())
{
File.Copy(file.FullName, destImageDi.FullName + "/" + file.Name);
}
}
HtmlToWmlConverterSettings settings = HtmlToWmlConverter.GetDefaultSettings();
// image references in HTML files contain the path to the subdir that contains the images, so base URI is the name of the directory
// that contains the HTML files
settings.BaseUriForImages = Path.Combine(TestUtil.TempDir.FullName);
WmlDocument doc = HtmlToWmlConverter.ConvertHtmlToWml(defaultCss, usedAuthorCss, userCss, html, settings, null, s_ProduceAnnotatedHtml ? annotatedHtmlFi.FullName : null);
Assert.NotNull(doc);
if (doc != null)
SaveValidateAndFormatMainDocPart(destDocxFi, doc);
#if DO_CONVERSION_VIA_WORD
var newAltChunkBeforeFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, name.Replace(".html", "-5-AltChunkBefore.docx")));
var newAltChunkAfterFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, name.Replace(".html", "-6-ConvertedViaWord.docx")));
WordAutomationUtilities.DoConversionViaWord(newAltChunkBeforeFi, newAltChunkAfterFi, html);
#endif
}
[Theory]
[InlineData("E0010.html")]
[InlineData("E0020.html")]
public void HW004(string name)
{
var sourceHtmlFi = new FileInfo(Path.Combine(TestUtil.SourceDir.FullName, name));
var sourceImageDi = new DirectoryInfo(Path.Combine(TestUtil.SourceDir.FullName, sourceHtmlFi.Name.Replace(".html", "_files")));
var destImageDi = new DirectoryInfo(Path.Combine(TestUtil.TempDir.FullName, sourceImageDi.Name));
var sourceCopiedToDestHtmlFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, sourceHtmlFi.Name.Replace(".html", "-1-Source.html")));
var destCssFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, sourceHtmlFi.Name.Replace(".html", "-2.css")));
var destDocxFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, sourceHtmlFi.Name.Replace(".html", "-3-ConvertedByHtmlToWml.docx")));
var annotatedHtmlFi = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, sourceHtmlFi.Name.Replace(".html", "-4-Annotated.txt")));
File.Copy(sourceHtmlFi.FullName, sourceCopiedToDestHtmlFi.FullName);
XElement html = HtmlToWmlReadAsXElement.ReadAsXElement(sourceCopiedToDestHtmlFi);
string usedAuthorCss = HtmlToWmlConverter.CleanUpCss((string)html.Descendants().FirstOrDefault(d => d.Name.LocalName.ToLower() == "style"));
File.WriteAllText(destCssFi.FullName, usedAuthorCss);
HtmlToWmlConverterSettings settings = HtmlToWmlConverter.GetDefaultSettings();
settings.BaseUriForImages = Path.Combine(TestUtil.TempDir.FullName);
Assert.Throws<OpenXmlPowerToolsException>(() => HtmlToWmlConverter.ConvertHtmlToWml(defaultCss, usedAuthorCss, userCss, html, settings, null, s_ProduceAnnotatedHtml ? annotatedHtmlFi.FullName : null));
}
private static void SaveValidateAndFormatMainDocPart(FileInfo destDocxFi, WmlDocument doc)
{
WmlDocument formattedDoc;
doc.SaveAs(destDocxFi.FullName);
using (MemoryStream ms = new MemoryStream())
{
ms.Write(doc.DocumentByteArray, 0, doc.DocumentByteArray.Length);
using (WordprocessingDocument document = WordprocessingDocument.Open(ms, true))
{
XDocument xDoc = document.MainDocumentPart.GetXDocument();
document.MainDocumentPart.PutXDocumentWithFormatting();
OpenXmlValidator validator = new OpenXmlValidator();
var errors = validator.Validate(document);
var errorsString = errors
.Select(e => e.Description + Environment.NewLine)
.StringConcatenate();
// Assert that there were no errors in the generated document.
Assert.Equal("", errorsString);
}
formattedDoc = new WmlDocument(destDocxFi.FullName, ms.ToArray());
}
formattedDoc.SaveAs(destDocxFi.FullName);
}
static string defaultCss =
@"html, address,
blockquote,
body, dd, div,
dl, dt, fieldset, form,
frame, frameset,
h1, h2, h3, h4,
h5, h6, noframes,
ol, p, ul, center,
dir, hr, menu, pre { display: block; unicode-bidi: embed }
li { display: list-item }
head { display: none }
table { display: table }
tr { display: table-row }
thead { display: table-header-group }
tbody { display: table-row-group }
tfoot { display: table-footer-group }
col { display: table-column }
colgroup { display: table-column-group }
td, th { display: table-cell }
caption { display: table-caption }
th { font-weight: bolder; text-align: center }
caption { text-align: center }
body { margin: auto; }
h1 { font-size: 2em; margin: auto; }
h2 { font-size: 1.5em; margin: auto; }
h3 { font-size: 1.17em; margin: auto; }
h4, p,
blockquote, ul,
fieldset, form,
ol, dl, dir,
menu { margin: auto }
a { color: blue; }
h5 { font-size: .83em; margin: auto }
h6 { font-size: .75em; margin: auto }
h1, h2, h3, h4,
h5, h6, b,
strong { font-weight: bolder }
blockquote { margin-left: 40px; margin-right: 40px }
i, cite, em,
var, address { font-style: italic }
pre, tt, code,
kbd, samp { font-family: monospace }
pre { white-space: pre }
button, textarea,
input, select { display: inline-block }
big { font-size: 1.17em }
small, sub, sup { font-size: .83em }
sub { vertical-align: sub }
sup { vertical-align: super }
table { border-spacing: 2px; }
thead, tbody,
tfoot { vertical-align: middle }
td, th, tr { vertical-align: inherit }
s, strike, del { text-decoration: line-through }
hr { border: 1px inset }
ol, ul, dir,
menu, dd { margin-left: 40px }
ol { list-style-type: decimal }
ol ul, ul ol,
ul ul, ol ol { margin-top: 0; margin-bottom: 0 }
u, ins { text-decoration: underline }
br:before { content: ""\A""; white-space: pre-line }
center { text-align: center }
:link, :visited { text-decoration: underline }
:focus { outline: thin dotted invert }
/* Begin bidirectionality settings (do not change) */
BDO[DIR=""ltr""] { direction: ltr; unicode-bidi: bidi-override }
BDO[DIR=""rtl""] { direction: rtl; unicode-bidi: bidi-override }
*[DIR=""ltr""] { direction: ltr; unicode-bidi: embed }
*[DIR=""rtl""] { direction: rtl; unicode-bidi: embed }
";
static string userCss = @"";
}
}
#endif