OpenXmlPowerTools.Tests.OA/ListItemRetrieverTests.cs - open-xml-powertools - Git at Google

 /***************************************************************************

 Copyright (c) Microsoft Corporation 2012-2015.

 This code is licensed using the Microsoft Public License (Ms-PL).  The text of the license can be found here:

 http://www.microsoft.com/resources/sharedsource/licensingbasics/publiclicense.mspx

 Published at http://OpenXmlDeveloper.org
 Resource Center and Documentation: http://openxmldeveloper.org/wiki/w/wiki/powertools-for-open-xml.aspx

 Developer: Eric White
 Blog: http://www.ericwhite.com
 Twitter: @EricWhiteDev
 Email: eric@ericwhite.com

 ***************************************************************************/

 using System;
 using System.Collections.Generic;
 using System.Drawing;
 using System.Drawing.Imaging;
 using System.IO;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
 using System.Xml;
 using System.Xml.Linq;
 using HtmlAgilityPack;
 using DocumentFormat.OpenXml.Packaging;
 using DocumentFormat.OpenXml.Validation;
 using OpenXmlPowerTools;
 using Xunit;

 namespace OxPt
 {
     public class LiTests
     {

         // PowerShell oneliner that generates InlineData for all files in a directory
         // dir | % { '[InlineData("' + $_.Name + '")]' } | clip

         [Theory]
         [InlineData("LIR001-en-US-ordinal.docx")]
         [InlineData("LIR002-en-US-ordinalText.docx")]
         [InlineData("LIR003-en-US-upperLetter.docx")]
         [InlineData("LIR004-en-US-upperRoman.docx")]
         [InlineData("LIR005-fr-FR-cardinalText.docx")]
         [InlineData("LIR006-fr-FR-ordinalText.docx")]
         [InlineData("LIR006-fr-FR-ordinal.docx")]
         // [InlineData("LIR007-ru-RU-ordinalText.docx")]  // todo this fails, the code needs updated.
         [InlineData("LIR008-zh-CH-chineseCountingThousand.docx")]
         [InlineData("LIR009-zh-CN-chineseCounting.docx")]
         [InlineData("LIR010-zh-CN-ideographTraditional.docx")]
         [InlineData("LIR011-en-US-00001.docx")]
         [InlineData("LIR012-en-US-0001.docx")]
         [InlineData("LIR013-en-US-001.docx")]
         [InlineData("LIR014-en-US-01.docx")]
         [InlineData("LIR015-en-US-cardinalText.docx")]
         [InlineData("LIR016-en-US-decimal.docx")]
         [InlineData("LIR017-en-US-decimalEnclosedCircle.docx")]
         [InlineData("LIR018-en-US-decimalZero.docx")]
         [InlineData("LIR019-en-US-lowerLetter.docx")]
         [InlineData("LIR020-en-US-lowerRoman.docx")]
         public void LIR001(string file)
         {
             FileInfo lirFile = new FileInfo(Path.Combine(TestUtil.SourceDir.FullName, file));
             WmlDocument wmlDoc = new WmlDocument(lirFile.FullName);

             var wordHtmlFile = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, lirFile.Name.Replace(".docx", "-Word.html")));
             WordAutomationUtilities.SaveAsHtmlUsingWord(lirFile, wordHtmlFile);

             var ptHtmlFile = ConvertToHtml(lirFile.FullName, TestUtil.TempDir.FullName);
             var fiPtXml = SaveHtmlAsXml(ptHtmlFile);

             // read and write to get the BOM on the file
             var wh = File.ReadAllText(wordHtmlFile.FullName, Encoding.Default);
             File.WriteAllText(wordHtmlFile.FullName, wh, Encoding.UTF8);

             var wordXml = SaveHtmlAsXml(wordHtmlFile);
             CompareNumbering(fiPtXml, wordXml);
         }

         private static void CompareNumbering(FileInfo fiPtXml, FileInfo wordXml)
         {
             char splitChar = '|';

             Console.WriteLine("Comparing {0} to {1}", fiPtXml.Name, wordXml.Name);
             var xdWord = XDocument.Load(wordXml.FullName);
             List<string> wordRawParagraphText = xdWord.Descendants("h2").Select(p => p.DescendantNodes().OfType<XText>().Select(t => t.Value).Aggregate((s, i) => s + i)).ToList();
             var wordTextToCompare = wordRawParagraphText.Where(p => p.Contains(splitChar.ToString())).Select(p => p.Split(splitChar)[0]).ToList();

             var xdPt = XDocument.Load(fiPtXml.FullName);
             XNamespace xhtml = "http://www.w3.org/1999/xhtml";
             List<string> ptRawParagraphText = xdPt.Descendants(xhtml + "h2").Select(p => p.DescendantNodes().OfType<XText>().Select(t => t.Value).Aggregate((s, i) => s + i)).ToList();
             var ptTextToCompare = ptRawParagraphText.Where(p => p.Contains(splitChar.ToString())).Select(p => new
             {
                 ListItem = p.Split(splitChar)[0],
                 ParaText = p.Split(splitChar)[1],
             }).ToList();

             if (!wordTextToCompare.Any())
             {
                 throw new Exception("Internal error - no items selected");
             }
             if (wordTextToCompare.Count() != ptTextToCompare.Count())
             {
                 Assert.True(false);
                 //Console.WriteLine("Error, differing line counts");
                 //Console.WriteLine("Word line count: {0}", wordTextToCompare.Count());
                 //Console.WriteLine("Pt line count: {0}", ptTextToCompare.Count());
                 return;
             }
             var zipped = wordTextToCompare.Zip(ptTextToCompare, (w, p) => new
             {
                 WordText = w,
                 PtText = p.ListItem,
                 ParagraphText = p.ParaText,
             });
             var mismatchList = zipped.Where(z =>
             {
                 var w = z.WordText.Replace('\n', ' ').Trim();
                 var p = z.PtText.Replace('\n', ' ').Trim();
                 var match = w == p;
                 if (match)
                     return false;
                 return true;
             }).Select(z => z.ParagraphText).ToList();
             if (mismatchList.Any())
             {
                 Assert.True(false);
                 //Console.WriteLine("Mismatches: {0}", mismatchList.Count());
                 //foreach (var item in mismatchList.Take(20))
                 //{
                 //    Console.WriteLine(item);
                 //}
             }
         }

         public static FileInfo ConvertToHtml(string file, string outputDirectory)
         {
             var fi = new FileInfo(file);
             byte[] byteArray = File.ReadAllBytes(fi.FullName);
             FileInfo destFileName;
             using (MemoryStream memoryStream = new MemoryStream())
             {
                 memoryStream.Write(byteArray, 0, byteArray.Length);
                 using (WordprocessingDocument wDoc = WordprocessingDocument.Open(memoryStream, true))
                 {
                     destFileName = new FileInfo(fi.Name.Replace(".docx", ".html"));
                     if (outputDirectory != null && outputDirectory != string.Empty)
                     {
                         DirectoryInfo di = new DirectoryInfo(outputDirectory);
                         if (!di.Exists)
                         {
                             throw new OpenXmlPowerToolsException("Output directory does not exist");
                         }
                         destFileName = new FileInfo(Path.Combine(di.FullName, destFileName.Name));
                     }
                     var imageDirectoryName = destFileName.FullName.Substring(0, destFileName.FullName.Length - 5) + "_files";
                     int imageCounter = 0;
                     var pageTitle = (string)wDoc.CoreFilePropertiesPart.GetXDocument().Descendants(DC.title).FirstOrDefault();
                     if (pageTitle == null)
                         pageTitle = fi.FullName;

                     HtmlConverterSettings settings = new HtmlConverterSettings()
                     {
                         PageTitle = pageTitle,
                         FabricateCssClasses = true,
                         CssClassPrefix = "pt-",
                         RestrictToSupportedLanguages = false,
                         RestrictToSupportedNumberingFormats = false,
                         ImageHandler = imageInfo =>
                         {
                             DirectoryInfo localDirInfo = new DirectoryInfo(imageDirectoryName);
                             if (!localDirInfo.Exists)
                                 localDirInfo.Create();
                             ++imageCounter;
                             string extension = imageInfo.ContentType.Split('/')[1].ToLower();
                             ImageFormat imageFormat = null;
                             if (extension == "png")
                             {
                                 // Convert png to jpeg.
                                 extension = "gif";
                                 imageFormat = ImageFormat.Gif;
                             }
                             else if (extension == "gif")
                                 imageFormat = ImageFormat.Gif;
                             else if (extension == "bmp")
                                 imageFormat = ImageFormat.Bmp;
                             else if (extension == "jpeg")
                                 imageFormat = ImageFormat.Jpeg;
                             else if (extension == "tiff")
                             {
                                 // Convert tiff to gif.
                                 extension = "gif";
                                 imageFormat = ImageFormat.Gif;
                             }
                             else if (extension == "x-wmf")
                             {
                                 extension = "wmf";
                                 imageFormat = ImageFormat.Wmf;
                             }

                             // If the image format isn't one that we expect, ignore it,
                             // and don't return markup for the link.
                             if (imageFormat == null)
                                 return null;

                             string imageFileName = imageDirectoryName + "/image" +
                                 imageCounter.ToString() + "." + extension;
                             try
                             {
                                 imageInfo.Bitmap.Save(imageFileName, imageFormat);
                             }
                             catch (System.Runtime.InteropServices.ExternalException)
                             {
                                 return null;
                             }
                             XElement img = new XElement(Xhtml.img,
                                 new XAttribute(NoNamespace.src, imageFileName),
                                 imageInfo.ImgStyleAttribute,
                                 imageInfo.AltText != null ?
                                     new XAttribute(NoNamespace.alt, imageInfo.AltText) : null);
                             return img;
                         }
                     };
                     XElement html = HtmlConverter.ConvertToHtml(wDoc, settings);

                     // Note: the xhtml returned by ConvertToHtmlTransform contains objects of type
                     // XEntity.  PtOpenXmlUtil.cs define the XEntity class.  See
                     // http://blogs.msdn.com/ericwhite/archive/2010/01/21/writing-entity-references-using-linq-to-xml.aspx
                     // for detailed explanation.
                     //
                     // If you further transform the XML tree returned by ConvertToHtmlTransform, you
                     // must do it correctly, or entities will not be serialized properly.

                     var htmlString = html.ToString(SaveOptions.DisableFormatting);
                     File.WriteAllText(destFileName.FullName, htmlString, Encoding.UTF8);
                 }
             }
             return destFileName;
         }

         public static FileInfo SaveHtmlAsXml(FileInfo htmlFileName)
         {
             string baseName = htmlFileName.Name.Substring(0, htmlFileName.Name.Length - htmlFileName.Extension.Length);
             FileInfo destFile = new FileInfo(Path.Combine(htmlFileName.DirectoryName, baseName + ".xml"));

             HtmlDocument hdoc = new HtmlDocument();
             hdoc.Load(htmlFileName.FullName, Encoding.UTF8);
             hdoc.OptionOutputAsXml = true;
             hdoc.Save(destFile.FullName, Encoding.UTF8);
             StringBuilder sb = new StringBuilder(File.ReadAllText(destFile.FullName, Encoding.Default));
             sb.Replace("è", "&egrave;");
             sb.Replace("&amp;", "&");
             sb.Replace("&nbsp;", "\xA0");
             sb.Replace("&quot;", "\"");
             sb.Replace("&lt;", "~lt;");
             sb.Replace("&gt;", "~gt;");
             sb.Replace("&#", "~#");
             sb.Replace("&", "&amp;");
             sb.Replace("~lt;", "&lt;");
             sb.Replace("~gt;", "&gt;");
             sb.Replace("~#", "&#");
             File.WriteAllText(destFile.FullName, sb.ToString(), Encoding.UTF8);
             return destFile;
         }
     }

 }
	/***************************************************************************

	Copyright (c) Microsoft Corporation 2012-2015.

	This code is licensed using the Microsoft Public License (Ms-PL). The text of the license can be found here:

	http://www.microsoft.com/resources/sharedsource/licensingbasics/publiclicense.mspx

	Published at http://OpenXmlDeveloper.org
	Resource Center and Documentation: http://openxmldeveloper.org/wiki/w/wiki/powertools-for-open-xml.aspx

	Developer: Eric White
	Blog: http://www.ericwhite.com
	Twitter: @EricWhiteDev
	Email: eric@ericwhite.com

	***************************************************************************/

	using System;
	using System.Collections.Generic;
	using System.Drawing;
	using System.Drawing.Imaging;
	using System.IO;
	using System.Linq;
	using System.Text;
	using System.Threading.Tasks;
	using System.Xml;
	using System.Xml.Linq;
	using HtmlAgilityPack;
	using DocumentFormat.OpenXml.Packaging;
	using DocumentFormat.OpenXml.Validation;
	using OpenXmlPowerTools;
	using Xunit;

	namespace OxPt
	{
	public class LiTests
	{

	// PowerShell oneliner that generates InlineData for all files in a directory
	// dir \| % { '[InlineData("' + $_.Name + '")]' } \| clip

	[Theory]
	[InlineData("LIR001-en-US-ordinal.docx")]
	[InlineData("LIR002-en-US-ordinalText.docx")]
	[InlineData("LIR003-en-US-upperLetter.docx")]
	[InlineData("LIR004-en-US-upperRoman.docx")]
	[InlineData("LIR005-fr-FR-cardinalText.docx")]
	[InlineData("LIR006-fr-FR-ordinalText.docx")]
	[InlineData("LIR006-fr-FR-ordinal.docx")]
	// [InlineData("LIR007-ru-RU-ordinalText.docx")] // todo this fails, the code needs updated.
	[InlineData("LIR008-zh-CH-chineseCountingThousand.docx")]
	[InlineData("LIR009-zh-CN-chineseCounting.docx")]
	[InlineData("LIR010-zh-CN-ideographTraditional.docx")]
	[InlineData("LIR011-en-US-00001.docx")]
	[InlineData("LIR012-en-US-0001.docx")]
	[InlineData("LIR013-en-US-001.docx")]
	[InlineData("LIR014-en-US-01.docx")]
	[InlineData("LIR015-en-US-cardinalText.docx")]
	[InlineData("LIR016-en-US-decimal.docx")]
	[InlineData("LIR017-en-US-decimalEnclosedCircle.docx")]
	[InlineData("LIR018-en-US-decimalZero.docx")]
	[InlineData("LIR019-en-US-lowerLetter.docx")]
	[InlineData("LIR020-en-US-lowerRoman.docx")]
	public void LIR001(string file)
	{
	FileInfo lirFile = new FileInfo(Path.Combine(TestUtil.SourceDir.FullName, file));
	WmlDocument wmlDoc = new WmlDocument(lirFile.FullName);

	var wordHtmlFile = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, lirFile.Name.Replace(".docx", "-Word.html")));
	WordAutomationUtilities.SaveAsHtmlUsingWord(lirFile, wordHtmlFile);

	var ptHtmlFile = ConvertToHtml(lirFile.FullName, TestUtil.TempDir.FullName);
	var fiPtXml = SaveHtmlAsXml(ptHtmlFile);

	// read and write to get the BOM on the file
	var wh = File.ReadAllText(wordHtmlFile.FullName, Encoding.Default);
	File.WriteAllText(wordHtmlFile.FullName, wh, Encoding.UTF8);

	var wordXml = SaveHtmlAsXml(wordHtmlFile);
	CompareNumbering(fiPtXml, wordXml);
	}

	private static void CompareNumbering(FileInfo fiPtXml, FileInfo wordXml)
	{
	char splitChar = '\|';

	Console.WriteLine("Comparing {0} to {1}", fiPtXml.Name, wordXml.Name);
	var xdWord = XDocument.Load(wordXml.FullName);
	List<string> wordRawParagraphText = xdWord.Descendants("h2").Select(p => p.DescendantNodes().OfType<XText>().Select(t => t.Value).Aggregate((s, i) => s + i)).ToList();
	var wordTextToCompare = wordRawParagraphText.Where(p => p.Contains(splitChar.ToString())).Select(p => p.Split(splitChar)[0]).ToList();

	var xdPt = XDocument.Load(fiPtXml.FullName);
	XNamespace xhtml = "http://www.w3.org/1999/xhtml";
	List<string> ptRawParagraphText = xdPt.Descendants(xhtml + "h2").Select(p => p.DescendantNodes().OfType<XText>().Select(t => t.Value).Aggregate((s, i) => s + i)).ToList();
	var ptTextToCompare = ptRawParagraphText.Where(p => p.Contains(splitChar.ToString())).Select(p => new
	{
	ListItem = p.Split(splitChar)[0],
	ParaText = p.Split(splitChar)[1],
	}).ToList();

	if (!wordTextToCompare.Any())
	{
	throw new Exception("Internal error - no items selected");
	}
	if (wordTextToCompare.Count() != ptTextToCompare.Count())
	{
	Assert.True(false);
	//Console.WriteLine("Error, differing line counts");
	//Console.WriteLine("Word line count: {0}", wordTextToCompare.Count());
	//Console.WriteLine("Pt line count: {0}", ptTextToCompare.Count());
	return;
	}
	var zipped = wordTextToCompare.Zip(ptTextToCompare, (w, p) => new
	{
	WordText = w,
	PtText = p.ListItem,
	ParagraphText = p.ParaText,
	});
	var mismatchList = zipped.Where(z =>
	{
	var w = z.WordText.Replace('\n', ' ').Trim();
	var p = z.PtText.Replace('\n', ' ').Trim();
	var match = w == p;
	if (match)
	return false;
	return true;
	}).Select(z => z.ParagraphText).ToList();
	if (mismatchList.Any())
	{
	Assert.True(false);
	//Console.WriteLine("Mismatches: {0}", mismatchList.Count());
	//foreach (var item in mismatchList.Take(20))
	//{
	// Console.WriteLine(item);
	//}
	}
	}

	public static FileInfo ConvertToHtml(string file, string outputDirectory)
	{
	var fi = new FileInfo(file);
	byte[] byteArray = File.ReadAllBytes(fi.FullName);
	FileInfo destFileName;
	using (MemoryStream memoryStream = new MemoryStream())
	{
	memoryStream.Write(byteArray, 0, byteArray.Length);
	using (WordprocessingDocument wDoc = WordprocessingDocument.Open(memoryStream, true))
	{
	destFileName = new FileInfo(fi.Name.Replace(".docx", ".html"));
	if (outputDirectory != null && outputDirectory != string.Empty)
	{
	DirectoryInfo di = new DirectoryInfo(outputDirectory);
	if (!di.Exists)
	{
	throw new OpenXmlPowerToolsException("Output directory does not exist");
	}
	destFileName = new FileInfo(Path.Combine(di.FullName, destFileName.Name));
	}
	var imageDirectoryName = destFileName.FullName.Substring(0, destFileName.FullName.Length - 5) + "_files";
	int imageCounter = 0;
	var pageTitle = (string)wDoc.CoreFilePropertiesPart.GetXDocument().Descendants(DC.title).FirstOrDefault();
	if (pageTitle == null)
	pageTitle = fi.FullName;

	HtmlConverterSettings settings = new HtmlConverterSettings()
	{
	PageTitle = pageTitle,
	FabricateCssClasses = true,
	CssClassPrefix = "pt-",
	RestrictToSupportedLanguages = false,
	RestrictToSupportedNumberingFormats = false,
	ImageHandler = imageInfo =>
	{
	DirectoryInfo localDirInfo = new DirectoryInfo(imageDirectoryName);
	if (!localDirInfo.Exists)
	localDirInfo.Create();
	++imageCounter;
	string extension = imageInfo.ContentType.Split('/')[1].ToLower();
	ImageFormat imageFormat = null;
	if (extension == "png")
	{
	// Convert png to jpeg.
	extension = "gif";
	imageFormat = ImageFormat.Gif;
	}
	else if (extension == "gif")
	imageFormat = ImageFormat.Gif;
	else if (extension == "bmp")
	imageFormat = ImageFormat.Bmp;
	else if (extension == "jpeg")
	imageFormat = ImageFormat.Jpeg;
	else if (extension == "tiff")
	{
	// Convert tiff to gif.
	extension = "gif";
	imageFormat = ImageFormat.Gif;
	}
	else if (extension == "x-wmf")
	{
	extension = "wmf";
	imageFormat = ImageFormat.Wmf;
	}

	// If the image format isn't one that we expect, ignore it,
	// and don't return markup for the link.
	if (imageFormat == null)
	return null;

	string imageFileName = imageDirectoryName + "/image" +
	imageCounter.ToString() + "." + extension;
	try
	{
	imageInfo.Bitmap.Save(imageFileName, imageFormat);
	}
	catch (System.Runtime.InteropServices.ExternalException)
	{
	return null;
	}
	XElement img = new XElement(Xhtml.img,
	new XAttribute(NoNamespace.src, imageFileName),
	imageInfo.ImgStyleAttribute,
	imageInfo.AltText != null ?
	new XAttribute(NoNamespace.alt, imageInfo.AltText) : null);
	return img;
	}
	};
	XElement html = HtmlConverter.ConvertToHtml(wDoc, settings);

	// Note: the xhtml returned by ConvertToHtmlTransform contains objects of type
	// XEntity. PtOpenXmlUtil.cs define the XEntity class. See
	// http://blogs.msdn.com/ericwhite/archive/2010/01/21/writing-entity-references-using-linq-to-xml.aspx
	// for detailed explanation.
	//
	// If you further transform the XML tree returned by ConvertToHtmlTransform, you
	// must do it correctly, or entities will not be serialized properly.

	var htmlString = html.ToString(SaveOptions.DisableFormatting);
	File.WriteAllText(destFileName.FullName, htmlString, Encoding.UTF8);
	}
	}
	return destFileName;
	}

	public static FileInfo SaveHtmlAsXml(FileInfo htmlFileName)
	{
	string baseName = htmlFileName.Name.Substring(0, htmlFileName.Name.Length - htmlFileName.Extension.Length);
	FileInfo destFile = new FileInfo(Path.Combine(htmlFileName.DirectoryName, baseName + ".xml"));

	HtmlDocument hdoc = new HtmlDocument();
	hdoc.Load(htmlFileName.FullName, Encoding.UTF8);
	hdoc.OptionOutputAsXml = true;
	hdoc.Save(destFile.FullName, Encoding.UTF8);
	StringBuilder sb = new StringBuilder(File.ReadAllText(destFile.FullName, Encoding.Default));
	sb.Replace("è", "è");
	sb.Replace("&", "&");
	sb.Replace(" ", "\xA0");
	sb.Replace(""", "\"");
	sb.Replace("<", "~lt;");
	sb.Replace(">", "~gt;");
	sb.Replace("&#", "~#");
	sb.Replace("&", "&");
	sb.Replace("~lt;", "<");
	sb.Replace("~gt;", ">");
	sb.Replace("~#", "&#");
	File.WriteAllText(destFile.FullName, sb.ToString(), Encoding.UTF8);
	return destFile;
	}
	}

	}