| /*************************************************************************** |
| |
| Copyright (c) Microsoft Corporation 2012-2015. |
| |
| This code is licensed using the Microsoft Public License (Ms-PL). The text of the license can be found here: |
| |
| http://www.microsoft.com/resources/sharedsource/licensingbasics/publiclicense.mspx |
| |
| Published at http://OpenXmlDeveloper.org |
| Resource Center and Documentation: http://openxmldeveloper.org/wiki/w/wiki/powertools-for-open-xml.aspx |
| |
| Developer: Eric White |
| Blog: http://www.ericwhite.com |
| Twitter: @EricWhiteDev |
| Email: eric@ericwhite.com |
| |
| ***************************************************************************/ |
| |
| using System; |
| using System.Collections.Generic; |
| using System.IO; |
| using System.IO.Packaging; |
| using System.Linq; |
| using System.Text; |
| using System.Xml.Linq; |
| using DocumentFormat.OpenXml.Packaging; |
| using DocumentFormat.OpenXml.Validation; |
| using System.Globalization; |
| |
| namespace OpenXmlPowerTools |
| { |
/// <summary>
/// Switches that select which optional sections the metrics methods add to their report.
/// All switches default to false.
/// </summary>
public class MetricsGetterSettings
{
    /// <summary>
    /// When true, the content-control report keeps the non-element nodes it encounters
    /// instead of dropping them.
    /// </summary>
    public bool IncludeTextInContentControls;

    /// <summary>
    /// When true, each reported spreadsheet table also carries its row and cell values.
    /// </summary>
    public bool IncludeXlsxTableCellData;

    /// <summary>
    /// When true, the report lists the namespace declarations found in the package's XML parts.
    /// </summary>
    public bool RetrieveNamespaceList;

    /// <summary>
    /// When true, the report lists the distinct content types of the package's parts.
    /// </summary>
    public bool RetrieveContentTypeList;
}
| |
| public class MetricsGetter |
| { |
/// <summary>
/// Produces the metrics report for an Open XML file on disk, choosing the
/// format-specific implementation from the file extension.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <param name="settings">Options passed through to the format-specific metrics method.</param>
/// <returns>
/// The metrics element, or null when the extension is not recognized as
/// WordprocessingML, SpreadsheetML or PresentationML.
/// </returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static XElement GetMetrics(string fileName, MetricsGetterSettings settings)
{
    FileInfo fi = new FileInfo(fileName);
    if (!fi.Exists)
    {
        // FileNotFoundException(string, string) is (message, fileName); it does not
        // apply format placeholders, so the message has to be formatted explicitly.
        throw new FileNotFoundException(string.Format("{0} does not exist.", fi.FullName), fi.FullName);
    }

    if (Util.IsWordprocessingML(fi.Extension))
    {
        WmlDocument wmlDoc = new WmlDocument(fi.FullName, true);
        return GetDocxMetrics(wmlDoc, settings);
    }
    if (Util.IsSpreadsheetML(fi.Extension))
    {
        SmlDocument smlDoc = new SmlDocument(fi.FullName, true);
        return GetXlsxMetrics(smlDoc, settings);
    }
    if (Util.IsPresentationML(fi.Extension))
    {
        PmlDocument pmlDoc = new PmlDocument(fi.FullName, true);
        return GetPptxMetrics(pmlDoc, settings);
    }

    // Unrecognized extension: nothing to measure.
    return null;
}
| |
/// <summary>
/// Produces the metrics report for a WordprocessingML document held in memory.
/// </summary>
/// <remarks>
/// If measuring fails with an <c>OpenXmlPowerToolsException</c> whose text contains
/// "Invalid Hyperlink", the package is repaired with <c>UriFixer</c> (not available in
/// NET35 builds) and measured a second time with the invalid-hyperlink flag set.
/// Any other <c>OpenXmlPowerToolsException</c> yields an error-only report.
/// </remarks>
/// <param name="wmlDoc">The document to measure; its byte array is copied, not modified.</param>
/// <param name="settings">Options that select optional report sections.</param>
/// <returns>The metrics element for the document.</returns>
public static XElement GetDocxMetrics(WmlDocument wmlDoc, MetricsGetterSettings settings)
{
    try
    {
        return GetDocxMetricsFromBytes(wmlDoc, false, settings);
    }
    catch (OpenXmlPowerToolsException e)
    {
        if (e.ToString().Contains("Invalid Hyperlink"))
        {
            using (MemoryStream ms = new MemoryStream())
            {
                ms.Write(wmlDoc.DocumentByteArray, 0, wmlDoc.DocumentByteArray.Length);
#if !NET35
                UriFixer.FixInvalidUri(ms, brokenUri => FixUri(brokenUri));
#endif
                // Keep the caller's file name so the report still identifies the
                // original document rather than a placeholder name.
                wmlDoc = new WmlDocument(wmlDoc.FileName, ms.ToArray());
            }
            return GetDocxMetricsFromBytes(wmlDoc, true, settings);
        }
    }

    // Reached only for an OpenXmlPowerToolsException that is not the hyperlink case.
    return new XElement(H.Metrics,
        new XAttribute(H.FileName, wmlDoc.FileName),
        new XAttribute(H.FileType, "WordprocessingML"),
        new XAttribute(H.Error, "Unknown error, metrics not determined"));
}

// Opens a writable in-memory copy of the document, accepts tracked revisions if any
// are present, and gathers the metrics; a RevisionTracking element records that
// revisions had to be accepted.
private static XElement GetDocxMetricsFromBytes(WmlDocument wmlDoc, bool invalidHyperlink, MetricsGetterSettings settings)
{
    // An expandable stream is required: the document is opened for editing.
    using (MemoryStream ms = new MemoryStream())
    {
        ms.Write(wmlDoc.DocumentByteArray, 0, wmlDoc.DocumentByteArray.Length);
        using (WordprocessingDocument document = WordprocessingDocument.Open(ms, true))
        {
            bool hasTrackedRevisions = RevisionAccepter.HasTrackedRevisions(document);
            if (hasTrackedRevisions)
                RevisionAccepter.AcceptRevisions(document);
            XElement metrics = GetWmlMetrics(wmlDoc.FileName, invalidHyperlink, document, settings);
            if (hasTrackedRevisions)
                metrics.Add(new XElement(H.RevisionTracking, new XAttribute(H.Val, true)));
            return metrics;
        }
    }
}
| |
/// <summary>
/// Supplies the replacement for a hyperlink target that could not be parsed.
/// The broken text itself is not inspected; every caller gets the same placeholder.
/// </summary>
/// <param name="brokenUri">The unparsable URI text (unused).</param>
/// <returns>A fixed placeholder URI.</returns>
private static Uri FixUri(string brokenUri)
{
    const string placeholderTarget = "http://broken-link/";
    Uri replacement = new Uri(placeholderTarget);
    return replacement;
}
| |
/// <summary>
/// Assembles the complete WordprocessingML report: style hierarchy, miscellaneous
/// metrics, per-part content-control metrics and the optional namespace and
/// content-type lists.
/// </summary>
/// <param name="fileName">Name written to the report's FileName attribute.</param>
/// <param name="invalidHyperlink">Whether the document needed hyperlink repair before it could be opened.</param>
/// <param name="wDoc">The open document to measure.</param>
/// <param name="settings">Options that select optional report sections.</param>
/// <returns>The Metrics element.</returns>
private static XElement GetWmlMetrics(string fileName, bool invalidHyperlink, WordprocessingDocument wDoc, MetricsGetterSettings settings)
{
    // The per-part section is gathered before anything else, as it always has been;
    // the steps below run in the same order in which they appear in the report.
    XElement partsSection = new XElement(H.Parts);
    partsSection.Add(wDoc.GetAllParts().Select(p => GetMetricsForWmlPart(p, settings)));

    XElement report = new XElement(H.Metrics,
        new XAttribute(H.FileName, fileName),
        new XAttribute(H.FileType, "WordprocessingML"));
    report.Add(GetStyleHierarchy(wDoc));
    report.Add(GetMiscWmlMetrics(wDoc, invalidHyperlink));

    // An empty Parts element is left out of the report.
    if (partsSection.HasElements)
        report.Add(partsSection);

    if (settings.RetrieveNamespaceList)
        report.Add(RetrieveNamespaceList(wDoc));
    if (settings.RetrieveContentTypeList)
        report.Add(RetrieveContentTypeList(wDoc));

    return report;
}
| |
/// <summary>
/// Lists the distinct content types of the package's parts, sorted, leaving out
/// relationship parts.
/// </summary>
/// <param name="oxPkg">The open package to survey.</param>
/// <returns>A ContentTypes element with one ContentType child per distinct content type.</returns>
private static XElement RetrieveContentTypeList(OpenXmlPackage oxPkg)
{
    const string relationshipContentType = "application/vnd.openxmlformats-package.relationships+xml";

    // The explicit range type casts every part to ZipPackagePart.
    IEnumerable<string> sortedContentTypes =
        from ZipPackagePart packagePart in oxPkg.Package.GetParts()
        where packagePart.ContentType != relationshipContentType
        orderby packagePart.ContentType
        select packagePart.ContentType;

    XElement result = new XElement(H.ContentTypes);
    foreach (string contentType in sortedContentTypes.Distinct())
        result.Add(new XElement(H.ContentType, new XAttribute(H.Val, contentType)));
    return result;
}
| |
/// <summary>
/// Surveys every XML part of the package (relationship parts excluded) and lists the
/// distinct namespace declarations found, as prefix/name pairs.
/// </summary>
/// <remarks>
/// A part that cannot be loaded as XML is skipped on purpose: the goal is the most
/// complete survey possible, and an unloadable part usually means the document is
/// damaged anyway.
/// </remarks>
/// <param name="oxPkg">The open package to survey.</param>
/// <returns>A Namespaces element with one Namespace child per distinct declaration.</returns>
private static XElement RetrieveNamespaceList(OpenXmlPackage oxPkg)
{
    Package pkg = oxPkg.Package;

    var xmlParts = pkg.GetParts()
        .Cast<ZipPackagePart>()
        .Where(p => p.ContentType != "application/vnd.openxmlformats-package.relationships+xml")
        // Content types are identifiers, so compare ordinally rather than by culture.
        .Where(p => p.ContentType.EndsWith("xml", StringComparison.OrdinalIgnoreCase));

    // Entries are stored as "prefix|namespaceName"; the set removes duplicates across parts.
    var uniqueNamespaces = new HashSet<string>();
    foreach (var xp in xmlParts)
    {
        using (Stream st = xp.GetStream())
        {
            try
            {
                XDocument xdoc = XDocument.Load(st);
                var declarations = xdoc
                    .Descendants()
                    .Attributes()
                    .Where(a => a.IsNamespaceDeclaration)
                    .Select(a => string.Format("{0}|{1}", a.Name.LocalName, a.Value));
                foreach (var item in declarations)
                    uniqueNamespaces.Add(item);
            }
            catch (Exception)
            {
                // Deliberate best effort: ignore this part and keep surveying the rest.
            }
        }
    }

    var xe = new XElement(H.Namespaces,
        uniqueNamespaces.OrderBy(t => t).Select(n =>
        {
            // Split on the first separator only, so a namespace name that itself
            // contains '|' is kept intact.
            var spl = n.Split(new[] { '|' }, 2);
            return new XElement(H.Namespace,
                new XAttribute(H.NamespacePrefix, spl[0]),
                new XAttribute(H.NamespaceName, spl[1]));
        }));
    return xe;
}
| |
/// <summary>
/// Gathers the miscellaneous document metrics: an optional invalid-hyperlink marker
/// followed by everything the document validation and analysis pass reports.
/// </summary>
/// <param name="document">The open document to measure.</param>
/// <param name="invalidHyperlink">Whether the document needed hyperlink repair before it could be opened.</param>
/// <returns>The metric elements, in the order they were produced.</returns>
private static List<XElement> GetMiscWmlMetrics(WordprocessingDocument document, bool invalidHyperlink)
{
    List<XElement> metrics = new List<XElement>();
    // Notes are collected by the analysis but are not part of this method's result.
    List<string> notes = new List<string>();
    Dictionary<XName, int> elementCountDictionary = new Dictionary<XName, int>();

    if (invalidHyperlink)
        metrics.Add(new XElement(H.InvalidHyperlink, new XAttribute(H.Val, invalidHyperlink)));

    // The boolean verdict is not reported; all findings are appended to metrics.
    ValidateWordprocessingDocument(document, metrics, notes, elementCountDictionary);

    return metrics;
}
| |
/// <summary>
/// Validates the document against the supported Office versions and appends the
/// markup-usage, size, numbering-format and font/character-set metrics to
/// <paramref name="metrics"/>.
/// </summary>
/// <param name="wDoc">The open document to analyze.</param>
/// <param name="metrics">Receives the metric elements.</param>
/// <param name="notes">Receives free-text notes from the run analysis.</param>
/// <param name="metricCountDictionary">Counters for markup features; every entry is emitted as a metric.</param>
/// <returns>
/// True when the document validated against at least one of the checked versions and
/// no saveThroughXslt element was met in the main document part.
/// </returns>
private static bool ValidateWordprocessingDocument(WordprocessingDocument wDoc, List<XElement> metrics, List<string> notes, Dictionary<XName, int> metricCountDictionary)
{
    bool valid = ValidateAgainstSpecificVersion(wDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2007, H.SdkValidationError2007);
    valid |= ValidateAgainstSpecificVersion(wDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2010, H.SdkValidationError2010);
#if !NET35
    valid |= ValidateAgainstSpecificVersion(wDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2013, H.SdkValidationError2013);
#endif

    int elementCount = 0;
    int paragraphCount = 0;
    int textCount = 0;
    foreach (var part in wDoc.ContentParts())
    {
        XDocument xDoc = part.GetXDocument();
        // Size statistics are taken from the main document part only.
        bool isMainPart = part.Uri == wDoc.MainDocumentPart.Uri;
        foreach (var e in xDoc.Descendants())
        {
            if (e.Name == W.txbxContent)
                IncrementMetric(metricCountDictionary, H.TextBox);
            else if (e.Name == W.sdt)
                IncrementMetric(metricCountDictionary, H.ContentControl);
            else if (e.Name == W.customXml)
                IncrementMetric(metricCountDictionary, H.CustomXmlMarkup);
            else if (e.Name == W.fldChar)
                IncrementMetric(metricCountDictionary, H.ComplexField);
            else if (e.Name == W.fldSimple)
                IncrementMetric(metricCountDictionary, H.SimpleField);
            else if (e.Name == W.altChunk)
                IncrementMetric(metricCountDictionary, H.AltChunk);
            else if (e.Name == W.tbl)
                IncrementMetric(metricCountDictionary, H.Table);
            else if (e.Name == W.hyperlink)
                IncrementMetric(metricCountDictionary, H.Hyperlink);
            else if (e.Name == W.framePr)
                IncrementMetric(metricCountDictionary, H.LegacyFrame);
            else if (e.Name == W.control)
                IncrementMetric(metricCountDictionary, H.ActiveX);
            else if (e.Name == W.subDoc)
                IncrementMetric(metricCountDictionary, H.SubDocument);
            else if (e.Name == VML.imagedata || e.Name == VML.fill || e.Name == VML.stroke || e.Name == A.blip)
            {
                // An image may be referenced through any of these three attributes;
                // each reference present is checked separately.
                var relId = (string)e.Attribute(R.embed);
                if (relId != null)
                    ValidateImageExists(part, relId, metricCountDictionary);
                relId = (string)e.Attribute(R.pict);
                if (relId != null)
                    ValidateImageExists(part, relId, metricCountDictionary);
                relId = (string)e.Attribute(R.id);
                if (relId != null)
                    ValidateImageExists(part, relId, metricCountDictionary);
            }

            if (isMainPart)
            {
                elementCount++;
                if (e.Name == W.p)
                    paragraphCount++;
                if (e.Name == W.t)
                    textCount += ((string)e).Length;
            }
        }
    }

    foreach (var item in metricCountDictionary)
    {
        metrics.Add(
            new XElement(item.Key, new XAttribute(H.Val, item.Value)));
    }

    metrics.Add(new XElement(H.ElementCount, new XAttribute(H.Val, elementCount)));

    // A main part without paragraphs would otherwise divide by zero and report a
    // meaningless number; report 0 instead.
    int averageParagraphLength = paragraphCount == 0
        ? 0
        : (int)((double)textCount / (double)paragraphCount);
    metrics.Add(new XElement(H.AverageParagraphLength, new XAttribute(H.Val, averageParagraphLength)));

    if (wDoc.GetAllParts().Any(part => part.ContentType == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"))
        metrics.Add(new XElement(H.EmbeddedXlsx, new XAttribute(H.Val, true)));

    NumberingFormatListAssembly(wDoc, metrics);

    XDocument wxDoc = wDoc.MainDocumentPart.GetXDocument();

    foreach (var d in wxDoc.Descendants())
    {
        if (d.Name == W.saveThroughXslt)
        {
            string rid = (string)d.Attribute(R.id);
            // Without a settings part there is no relationship to resolve against,
            // so the reference is reported as invalid instead of failing.
            var settingsPart = wDoc.MainDocumentPart.DocumentSettingsPart;
            var tempExternalRelationship = settingsPart == null
                ? null
                : settingsPart.ExternalRelationships.FirstOrDefault(h => h.Id == rid);
            if (tempExternalRelationship == null)
                metrics.Add(new XElement(H.InvalidSaveThroughXslt, new XAttribute(H.Val, true)));
            // NOTE(review): the document is marked invalid for every saveThroughXslt
            // element, whether or not its relationship resolved - confirm this is intended.
            valid = false;
        }
        else if (d.Name == W.trackRevisions)
            metrics.Add(new XElement(H.TrackRevisionsEnabled, new XAttribute(H.Val, true)));
        else if (d.Name == W.documentProtection)
            metrics.Add(new XElement(H.DocumentProtection, new XAttribute(H.Val, true)));
    }

    FontAndCharSetAnalysis(wDoc, metrics, notes);

    return valid;
}
| |
/// <summary>
/// Validates a wordprocessing document against one Office version and records the
/// outcome in <paramref name="metrics"/>.
/// </summary>
/// <returns>True when the validator reported no errors.</returns>
private static bool ValidateAgainstSpecificVersion(WordprocessingDocument wDoc, List<XElement> metrics, DocumentFormat.OpenXml.FileFormatVersions versionToValidateAgainst, XName versionSpecificMetricName)
{
    return ValidatePackageAgainstVersion(wDoc, metrics, versionToValidateAgainst, versionSpecificMetricName);
}

/// <summary>
/// Validates a spreadsheet document against one Office version and records the
/// outcome in <paramref name="metrics"/>.
/// </summary>
/// <returns>True when the validator reported no errors.</returns>
private static bool ValidateAgainstSpecificVersion(SpreadsheetDocument sDoc, List<XElement> metrics, DocumentFormat.OpenXml.FileFormatVersions versionToValidateAgainst, XName versionSpecificMetricName)
{
    return ValidatePackageAgainstVersion(sDoc, metrics, versionToValidateAgainst, versionSpecificMetricName);
}

/// <summary>
/// Validates a presentation document against one Office version and records the
/// outcome in <paramref name="metrics"/>.
/// </summary>
/// <returns>True when the validator reported no errors.</returns>
private static bool ValidateAgainstSpecificVersion(PresentationDocument pDoc, List<XElement> metrics, DocumentFormat.OpenXml.FileFormatVersions versionToValidateAgainst, XName versionSpecificMetricName)
{
    return ValidatePackageAgainstVersion(pDoc, metrics, versionToValidateAgainst, versionSpecificMetricName);
}

// Shared implementation of the three typed overloads. On failure it adds the generic
// SdkValidationError marker (once per metrics list) and a version-specific element
// that carries the text of the first few errors.
private static bool ValidatePackageAgainstVersion(OpenXmlPackage package, List<XElement> metrics, DocumentFormat.OpenXml.FileFormatVersions versionToValidateAgainst, XName versionSpecificMetricName)
{
    const int maxReportedErrors = 3;

    OpenXmlValidator validator = new OpenXmlValidator(versionToValidateAgainst);
    // Only the first few errors are ever reported, so materialize just those, once,
    // instead of enumerating the validator's result repeatedly.
    List<ValidationErrorInfo> reportedErrors = validator.Validate(package).Take(maxReportedErrors).ToList();
    if (reportedErrors.Count == 0)
        return true;

    if (!metrics.Any(e => e.Name == H.SdkValidationError))
        metrics.Add(new XElement(H.SdkValidationError, new XAttribute(H.Val, true)));
    metrics.Add(new XElement(versionSpecificMetricName,
        new XAttribute(H.Val, true),
        reportedErrors.Select(err => FormatValidationError(err))));
    return false;
}

// Renders one validation error as three lines: description (cut off after 300
// characters), part URI and XPath. Missing details are rendered as empty text so
// that one incomplete error cannot abort the whole report.
private static string FormatValidationError(ValidationErrorInfo err)
{
    const int maxDescriptionLength = 300;

    string description = err.Description ?? string.Empty;
    if (description.Length > maxDescriptionLength)
        description = description.Substring(0, maxDescriptionLength) + " ... elided ...";
    string partUri = err.Part != null ? err.Part.Uri.ToString() : string.Empty;
    string xPath = err.Path != null ? err.Path.XPath : string.Empty;

    StringBuilder sb = new StringBuilder();
    sb.Append(PtUtils.MakeValidXml(description) + Environment.NewLine);
    sb.Append(" in part " + PtUtils.MakeValidXml(partUri) + Environment.NewLine);
    sb.Append(" at " + PtUtils.MakeValidXml(xPath) + Environment.NewLine);
    return sb.ToString();
}
| |
/// <summary>
/// Adds one to the counter stored under <paramref name="xName"/>, creating the
/// counter with a value of 1 when it does not exist yet.
/// </summary>
/// <param name="metricCountDictionary">The counters to update.</param>
/// <param name="xName">The metric whose counter is incremented.</param>
private static void IncrementMetric(Dictionary<XName, int> metricCountDictionary, XName xName)
{
    // TryGetValue leaves the count at 0 for an unknown key, so one assignment
    // covers both the first occurrence and every later one.
    int currentCount;
    metricCountDictionary.TryGetValue(xName, out currentCount);
    metricCountDictionary[xName] = currentCount + 1;
}
| |
/// <summary>
/// Counts a ReferenceToNullImage when <paramref name="part"/> has no related part
/// registered under the relationship id <paramref name="relId"/>.
/// </summary>
/// <param name="part">The part whose related parts are searched.</param>
/// <param name="relId">The relationship id taken from the markup.</param>
/// <param name="metrics">The counters to update.</param>
private static void ValidateImageExists(OpenXmlPart part, string relId, Dictionary<XName, int> metrics)
{
    var referencedPart = part.Parts.FirstOrDefault(pair => pair.RelationshipId == relId);
    if (referencedPart != null)
        return;

    IncrementMetric(metrics, H.ReferenceToNullImage);
}
| |
| |
/// <summary>
/// Adds a NumberingFormatList metric naming every numbering format used by list
/// paragraphs in the document's content parts. Each format appears once, in order
/// of first use, separated by commas. Nothing is added when no list paragraph is found.
/// </summary>
/// <param name="wDoc">The open document to analyze.</param>
/// <param name="metrics">Receives the metric element.</param>
private static void NumberingFormatListAssembly(WordprocessingDocument wDoc, List<XElement> metrics)
{
    List<string> numFmtList = new List<string>();
    // Tracks formats across all parts so one used in several parts is listed once.
    HashSet<string> seenFormats = new HashSet<string>();

    foreach (var part in wDoc.ContentParts())
    {
        var xDoc = part.GetXDocument();
        foreach (var paragraph in xDoc.Descendants(W.p))
        {
            string numFmt = GetNumberingFormatForParagraph(wDoc, paragraph);
            if (numFmt != null && seenFormats.Add(numFmt))
                numFmtList.Add(numFmt);
        }
    }

    if (numFmtList.Count > 0)
    {
        // Join places separators only between entries, so a format that itself ends
        // with a comma is reported unchanged.
        var nfls = string.Join(",", numFmtList.ToArray());
        metrics.Add(new XElement(H.NumberingFormatList, new XAttribute(H.Val, PtUtils.MakeValidXml(nfls))));
    }
}

// Returns the numbering format of the list level that applies to the paragraph, or
// null when the paragraph is not a list item or its level declares no format.
private static string GetNumberingFormatForParagraph(WordprocessingDocument wDoc, XElement p)
{
    // RetrieveListItem leaves its result on the paragraph as an annotation.
    ListItemRetriever.RetrieveListItem(wDoc, p, null);
    ListItemRetriever.ListItemInfo lif = p.Annotation<ListItemRetriever.ListItemInfo>();
    if (lif == null || !lif.IsListItem)
        return null;

    var level = lif.Lvl(ListItemRetriever.GetParagraphLevel(p));
    if (level == null)
        return null;

    string numFmtForLevel = (string)level.Elements(W.numFmt).Attributes(W.val).FirstOrDefault();
    if (numFmtForLevel == null)
    {
        // No direct w:numFmt: look for one inside mc:AlternateContent/mc:Choice and,
        // when it is a custom format, report its format string.
        var numFmtElement = level.Elements(MC.AlternateContent).Elements(MC.Choice).Elements(W.numFmt).FirstOrDefault();
        if (numFmtElement != null && (string)numFmtElement.Attribute(W.val) == "custom")
            numFmtForLevel = (string)numFmtElement.Attribute(W.format);
    }
    return numFmtForLevel;
}
| |
// Running totals gathered while the runs of a document are analyzed.
class FormattingMetrics
{
    // Run-level totals.
    public int RunCount;
    public int RunWithoutRprCount;
    public int ZeroLengthText;
    public int MultiFontRun;

    // Characters counted per font type.
    public int AsciiCharCount;
    public int CSCharCount;
    public int EastAsiaCharCount;
    public int HAnsiCharCount;

    // Single-font runs counted per font type.
    public int AsciiRunCount;
    public int CSRunCount;
    public int EastAsiaRunCount;
    public int HAnsiRunCount;

    // Languages seen so far; starts out empty.
    public List<string> Languages = new List<string>();
}
| |
/// <summary>
/// Assembles the document's formatting, analyzes every run in the content parts and
/// appends the run, character-set and language metrics to <paramref name="metrics"/>.
/// </summary>
/// <param name="wDoc">The open document; formatting assembly is applied to it.</param>
/// <param name="metrics">Receives the metric elements.</param>
/// <param name="notes">Receives free-text notes from the run analysis.</param>
private static void FontAndCharSetAnalysis(WordprocessingDocument wDoc, List<XElement> metrics, List<string> notes)
{
    FormattingAssemblerSettings assemblerSettings = new FormattingAssemblerSettings();
    assemblerSettings.RemoveStyleNamesFromParagraphAndRunProperties = false;
    assemblerSettings.ClearStyles = true;
    assemblerSettings.RestrictToSupportedNumberingFormats = false;
    assemblerSettings.RestrictToSupportedLanguages = false;
    FormattingAssembler.AssembleFormatting(wDoc, assemblerSettings);

    FormattingMetrics totals = new FormattingMetrics();
    foreach (var contentPart in wDoc.ContentParts())
    {
        string partUri = contentPart.Uri.ToString();
        foreach (XElement run in contentPart.GetXDocument().Descendants(W.r))
        {
            totals.RunCount++;
            AnalyzeRun(run, metrics, notes, totals, partUri);
        }
    }

    // The run count is always reported; the remaining counters only when non-zero.
    metrics.Add(new XElement(H.RunCount, new XAttribute(H.Val, totals.RunCount)));

    KeyValuePair<XName, int>[] optionalCounters =
    {
        new KeyValuePair<XName, int>(H.RunWithoutRprCount, totals.RunWithoutRprCount),
        new KeyValuePair<XName, int>(H.ZeroLengthText, totals.ZeroLengthText),
        new KeyValuePair<XName, int>(H.MultiFontRun, totals.MultiFontRun),
        new KeyValuePair<XName, int>(H.AsciiCharCount, totals.AsciiCharCount),
        new KeyValuePair<XName, int>(H.CSCharCount, totals.CSCharCount),
        new KeyValuePair<XName, int>(H.EastAsiaCharCount, totals.EastAsiaCharCount),
        new KeyValuePair<XName, int>(H.HAnsiCharCount, totals.HAnsiCharCount),
        new KeyValuePair<XName, int>(H.AsciiRunCount, totals.AsciiRunCount),
        new KeyValuePair<XName, int>(H.CSRunCount, totals.CSRunCount),
        new KeyValuePair<XName, int>(H.EastAsiaRunCount, totals.EastAsiaRunCount),
        new KeyValuePair<XName, int>(H.HAnsiRunCount, totals.HAnsiRunCount),
    };
    foreach (KeyValuePair<XName, int> counter in optionalCounters)
    {
        if (counter.Value > 0)
            metrics.Add(new XElement(counter.Key, new XAttribute(H.Val, counter.Value)));
    }

    if (totals.Languages.Count > 0)
    {
        string languageList = totals.Languages.StringConcatenate(s => s + ",").TrimEnd(',');
        metrics.Add(new XElement(H.Languages, new XAttribute(H.Val, PtUtils.MakeValidXml(languageList))));
    }
}
| |
/// <summary>
/// Classifies the text of one run by font type and folds the result into
/// <paramref name="formattingMetrics"/>: character and run counts per font type,
/// multi-font runs, runs without text, runs without properties, and languages.
/// </summary>
/// <param name="run">The w:r element to analyze.</param>
/// <param name="attList">Not used by this method.</param>
/// <param name="notes">Receives a note for every run that has text but no w:rPr.</param>
/// <param name="formattingMetrics">The running totals to update.</param>
/// <param name="uri">URI of the containing part, used in notes.</param>
private static void AnalyzeRun(XElement run, List<XElement> attList, List<string> notes, FormattingMetrics formattingMetrics, string uri)
{
    string text = run.Elements()
        .Where(child => child.Name == W.t || child.Name == W.delText)
        .Select(child => (string)child)
        .StringConcatenate();
    if (text.Length == 0)
    {
        formattingMetrics.ZeroLengthText++;
        return;
    }

    XElement runProperties = run.Element(W.rPr);
    if (runProperties == null)
    {
        formattingMetrics.RunWithoutRprCount++;
        notes.Add(PtUtils.MakeValidXml(string.Format("Error in part {0}: run without rPr at {1}", uri, run.GetXPath())));
        // Continue the analysis with empty run properties.
        runProperties = new XElement(W.rPr);
    }

    FormattingAssembler.CharStyleAttributes csa = new FormattingAssembler.CharStyleAttributes(null, runProperties);

    // One font type per character, and the distinct types in order of first appearance.
    var perCharacterTypes = text
        .Select(ch => FormattingAssembler.DetermineFontTypeFromCharacter(ch, csa))
        .ToArray();
    var distinctTypes = perCharacterTypes.Distinct().ToArray();

    // The language that applies to each distinct font type; a missing language is
    // replaced by the name of the current culture.
    List<string> runLanguages = new List<string>();
    foreach (var fontType in distinctTypes)
    {
        string language;
        switch (fontType)
        {
            case FormattingAssembler.FontType.CS:
                language = csa.BidiLang;
                break;
            case FormattingAssembler.FontType.EastAsia:
                language = csa.EastAsiaLang;
                break;
            default: // Ascii, HAnsi
                language = csa.LatinLang;
                break;
        }
        if (string.IsNullOrEmpty(language))
            language = CultureInfo.CurrentCulture.Name;
        if (!runLanguages.Contains(language))
            runLanguages.Add(language);
    }
    if (runLanguages.Any(l => !formattingMetrics.Languages.Contains(l)))
        formattingMetrics.Languages = formattingMetrics.Languages.Concat(runLanguages).Distinct().ToList();

    int distinctFontCount = distinctTypes
        .Select(fontType => GetFontFromFontType(csa, fontType))
        .Distinct()
        .Count();

    if (distinctFontCount > 1)
    {
        // Several fonts in one run: count characters per type, but no per-type run.
        formattingMetrics.MultiFontRun++;
        formattingMetrics.AsciiCharCount += perCharacterTypes.Count(ft => ft == FormattingAssembler.FontType.Ascii);
        formattingMetrics.CSCharCount += perCharacterTypes.Count(ft => ft == FormattingAssembler.FontType.CS);
        formattingMetrics.EastAsiaCharCount += perCharacterTypes.Count(ft => ft == FormattingAssembler.FontType.EastAsia);
        formattingMetrics.HAnsiCharCount += perCharacterTypes.Count(ft => ft == FormattingAssembler.FontType.HAnsi);
        return;
    }

    // Single font: the whole run is attributed to the type of its first character.
    var leadingType = perCharacterTypes[0];
    if (leadingType == FormattingAssembler.FontType.Ascii)
    {
        formattingMetrics.AsciiCharCount += text.Length;
        formattingMetrics.AsciiRunCount++;
    }
    else if (leadingType == FormattingAssembler.FontType.CS)
    {
        formattingMetrics.CSCharCount += text.Length;
        formattingMetrics.CSRunCount++;
    }
    else if (leadingType == FormattingAssembler.FontType.EastAsia)
    {
        formattingMetrics.EastAsiaCharCount += text.Length;
        formattingMetrics.EastAsiaRunCount++;
    }
    else if (leadingType == FormattingAssembler.FontType.HAnsi)
    {
        formattingMetrics.HAnsiCharCount += text.Length;
        formattingMetrics.HAnsiRunCount++;
    }
}
| |
/// <summary>
/// Picks the font that the character style attributes assign to a font type.
/// </summary>
/// <param name="csa">The character style attributes holding one font per font type.</param>
/// <param name="ft">The font type to look up.</param>
/// <returns>The font for <paramref name="ft"/>; the ASCII font for any other value.</returns>
private static string GetFontFromFontType(FormattingAssembler.CharStyleAttributes csa, FormattingAssembler.FontType ft)
{
    if (ft == FormattingAssembler.FontType.CS)
        return csa.CsFont;
    if (ft == FormattingAssembler.FontType.EastAsia)
        return csa.EastAsiaFont;
    if (ft == FormattingAssembler.FontType.HAnsi)
        return csa.HAnsiFont;

    // Ascii, and the fallback for anything unexpected.
    return csa.AsciiFont;
}
| |
/// <summary>
/// Produces the metrics report for a SpreadsheetML document held in memory:
/// validation results, table information per sheet and the optional namespace and
/// content-type lists.
/// </summary>
/// <param name="smlDoc">The document to measure.</param>
/// <param name="settings">Options that select optional report sections.</param>
/// <returns>The Metrics element.</returns>
public static XElement GetXlsxMetrics(SmlDocument smlDoc, MetricsGetterSettings settings)
{
    using (OpenXmlMemoryStreamDocument streamDoc = new OpenXmlMemoryStreamDocument(smlDoc))
    using (SpreadsheetDocument sDoc = streamDoc.GetSpreadsheetDocument())
    {
        // Each validation appends its findings; the boolean verdicts are not reported.
        List<XElement> validationMetrics = new List<XElement>();
        ValidateAgainstSpecificVersion(sDoc, validationMetrics, DocumentFormat.OpenXml.FileFormatVersions.Office2007, H.SdkValidationError2007);
        ValidateAgainstSpecificVersion(sDoc, validationMetrics, DocumentFormat.OpenXml.FileFormatVersions.Office2010, H.SdkValidationError2010);
#if !NET35
        ValidateAgainstSpecificVersion(sDoc, validationMetrics, DocumentFormat.OpenXml.FileFormatVersions.Office2013, H.SdkValidationError2013);
#endif

        XElement report = new XElement(H.Metrics,
            new XAttribute(H.FileName, smlDoc.FileName),
            new XAttribute(H.FileType, "SpreadsheetML"),
            validationMetrics);
        report.Add(GetTableInfoForWorkbook(sDoc, settings));
        if (settings.RetrieveNamespaceList)
            report.Add(RetrieveNamespaceList(sDoc));
        if (settings.RetrieveContentTypeList)
            report.Add(RetrieveContentTypeList(sDoc));
        return report;
    }
}
| |
/// <summary>
/// Collects the table information of every worksheet that the workbook lists.
/// </summary>
/// <param name="spreadsheet">The open spreadsheet document.</param>
/// <param name="settings">Options forwarded to the per-sheet collection.</param>
/// <returns>A Sheets element with one Sheet child for each worksheet that contains tables.</returns>
private static XElement GetTableInfoForWorkbook(SpreadsheetDocument spreadsheet, MetricsGetterSettings settings)
{
    var workbookPart = spreadsheet.WorkbookPart;
    var xd = workbookPart.GetXDocument();
    var partInformation =
        new XElement(H.Sheets,
            xd.Root
                // Elements(...) rather than Element(...): a workbook without a sheets
                // element then simply yields no sheets.
                .Elements(S.sheets)
                .Elements(S.sheet)
                .Select(sh =>
                {
                    var rid = (string)sh.Attribute(R.id);
                    if (rid == null)
                        return null;
                    var sheetName = (string)sh.Attribute("name");
                    // A sheet entry may point at a part that is not a worksheet
                    // (for example a chart sheet); such parts have no tables to report.
                    WorksheetPart worksheetPart = workbookPart.GetPartById(rid) as WorksheetPart;
                    if (worksheetPart == null)
                        return null;
                    return GetTableInfoForSheet(spreadsheet, worksheetPart, sheetName, settings);
                }));
    return partInformation;
}
| |
/// <summary>
/// Describes the tables of one worksheet: name, display name and columns of each
/// table and, when requested, the cell values of every table row.
/// </summary>
/// <param name="spreadsheetDocument">The document that owns the sheet; used to read table data.</param>
/// <param name="sheetPart">The worksheet part to inspect.</param>
/// <param name="sheetName">Name written to the Sheet element.</param>
/// <param name="settings">IncludeXlsxTableCellData selects whether cell values are included.</param>
/// <returns>The Sheet element, or null when the sheet has no tables.</returns>
public static XElement GetTableInfoForSheet(SpreadsheetDocument spreadsheetDocument, WorksheetPart sheetPart, string sheetName,
    MetricsGetterSettings settings)
{
    XDocument sheetXml = sheetPart.GetXDocument();
    XElement sheetInformation = new XElement(H.Sheet, new XAttribute(H.Name, sheetName));

    foreach (XElement tablePartReference in sheetXml.Root.Elements(S.tableParts).Elements(S.tablePart))
    {
        string relationshipId = (string)tablePartReference.Attribute(R.id);
        TableDefinitionPart tableDefinition = (TableDefinitionPart)sheetPart.GetPartById(relationshipId);
        XDocument tableXml = tableDefinition.GetXDocument();
        string displayName = (string)tableXml.Root.Attribute("displayName");

        XElement tableCellData = null;
        if (settings.IncludeXlsxTableCellData)
        {
            var xlsxTable = spreadsheetDocument.Table(displayName);
            tableCellData = new XElement(H.TableData);
            foreach (var row in xlsxTable.TableRows())
            {
                // One Cell per table column, holding the row's value for that column.
                XElement rowElement = new XElement(H.Row);
                foreach (var col in xlsxTable.TableColumns())
                {
                    rowElement.Add(new XElement(H.Cell,
                        new XAttribute(H.Name, col.Name),
                        new XAttribute(H.Val, (string)row[col.Name])));
                }
                tableCellData.Add(rowElement);
            }
        }

        XElement table = new XElement(H.Table,
            new XAttribute(H.Name, (string)tableXml.Root.Attribute("name")),
            new XAttribute(H.DisplayName, displayName));
        XElement columns = new XElement(H.Columns);
        foreach (XElement tableColumn in tableXml.Root.Element(S.tableColumns).Elements(S.tableColumn))
            columns.Add(new XElement(H.Column, new XAttribute(H.Name, (string)tableColumn.Attribute("name"))));
        table.Add(columns);
        table.Add(tableCellData);

        sheetInformation.Add(table);
    }

    return sheetInformation.HasElements ? sheetInformation : null;
}
| |
/// <summary>
/// Produces the metrics report for a PresentationML document held in memory:
/// validation results plus the optional namespace and content-type lists.
/// </summary>
/// <param name="pmlDoc">The document to measure.</param>
/// <param name="settings">Options that select optional report sections.</param>
/// <returns>The Metrics element.</returns>
public static XElement GetPptxMetrics(PmlDocument pmlDoc, MetricsGetterSettings settings)
{
    using (OpenXmlMemoryStreamDocument streamDoc = new OpenXmlMemoryStreamDocument(pmlDoc))
    using (PresentationDocument pDoc = streamDoc.GetPresentationDocument())
    {
        // Each validation appends its findings; the boolean verdicts are not reported.
        List<XElement> validationMetrics = new List<XElement>();
        ValidateAgainstSpecificVersion(pDoc, validationMetrics, DocumentFormat.OpenXml.FileFormatVersions.Office2007, H.SdkValidationError2007);
        ValidateAgainstSpecificVersion(pDoc, validationMetrics, DocumentFormat.OpenXml.FileFormatVersions.Office2010, H.SdkValidationError2010);
#if !NET35
        ValidateAgainstSpecificVersion(pDoc, validationMetrics, DocumentFormat.OpenXml.FileFormatVersions.Office2013, H.SdkValidationError2013);
#endif

        XElement report = new XElement(H.Metrics,
            new XAttribute(H.FileName, pmlDoc.FileName),
            new XAttribute(H.FileType, "PresentationML"),
            validationMetrics);
        if (settings.RetrieveNamespaceList)
            report.Add(RetrieveNamespaceList(pDoc));
        if (settings.RetrieveContentTypeList)
            report.Add(RetrieveContentTypeList(pDoc));
        return report;
    }
}
| |
/// <summary>
/// Builds a tree of the document's styles in which every style is nested under the
/// style it is based on.
/// </summary>
/// <remarks>
/// Styles without a styleId are skipped. A basedOn chain that loops back on itself is
/// cut at the first repeated style, and a style whose base cannot be found is attached
/// at the deepest ancestor that does exist (the root when none does).
/// </remarks>
/// <param name="document">The open document.</param>
/// <returns>The StyleHierarchy element, or null when the document has no style definitions part.</returns>
private static object GetStyleHierarchy(WordprocessingDocument document)
{
    var stylePart = document.MainDocumentPart.StyleDefinitionsPart;
    if (stylePart == null)
        return null;
    var xd = stylePart.GetXDocument();

    // Index styles by id once; for a duplicated id the first definition wins.
    var stylesById = new Dictionary<string, XElement>();
    foreach (var style in xd.Root.Elements(W.style))
    {
        var id = (string)style.Attribute(W.styleId);
        if (id != null && !stylesById.ContainsKey(id))
            stylesById.Add(id, style);
    }

    // One "base/.../style" path per style. Sorting puts every path after the paths of
    // its ancestors, so parents are inserted before their children.
    var stylesWithPath = xd.Root
        .Elements(W.style)
        .Where(s => s.Attribute(W.styleId) != null)
        .Select(s => BuildStylePath(s, stylesById))
        .OrderBy(n => n)
        .ToList();

    XElement styleHierarchy = new XElement(H.StyleHierarchy);
    foreach (var item in stylesWithPath)
    {
        var styleChain = item.Split('/');
        XElement elementToAddTo = styleHierarchy;
        for (int i = 0; i < styleChain.Length - 1; i++)
        {
            var inChain = styleChain[i];
            var next = elementToAddTo.Elements(H.Style).FirstOrDefault(z => (string)z.Attribute(H.Id) == inChain);
            if (next == null)
                break; // ancestor is not in the tree: attach at the deepest level found
            elementToAddTo = next;
        }

        var styleToAdd = styleChain[styleChain.Length - 1];
        XElement definition;
        stylesById.TryGetValue(styleToAdd, out definition);
        var styleType = definition != null ? (string)definition.Attribute(W.type) : null;
        elementToAddTo.Add(
            new XElement(H.Style,
                new XAttribute(H.Id, styleToAdd),
                // The Type attribute is left out when the style declares no type.
                styleType != null ? new XAttribute(H.Type, styleType) : null));
    }
    return styleHierarchy;
}

// Returns the '/'-separated chain of style ids from the outermost base style down to
// the given style, following basedOn references until one is missing, unknown or
// already part of the chain.
private static string BuildStylePath(XElement style, Dictionary<string, XElement> stylesById)
{
    var path = (string)style.Attribute(W.styleId);
    var visited = new HashSet<string>();
    visited.Add(path);

    XElement current = style;
    while (current != null)
    {
        var baseStyle = (string)current.Elements(W.basedOn).Attributes(W.val).FirstOrDefault();
        // Stop at the end of the chain, or when the chain would revisit a style.
        if (baseStyle == null || !visited.Add(baseStyle))
            break;
        path = baseStyle + "/" + path;
        // Leaves current null, ending the walk, when the base style is not defined.
        stylesById.TryGetValue(baseStyle, out current);
    }
    return path;
}
| |
/// <summary>
/// Reports the content controls found in one part. Only main document, header,
/// footer, footnotes and endnotes parts are examined.
/// </summary>
/// <param name="part">The part to inspect.</param>
/// <param name="settings">Options forwarded to the content-control transform.</param>
/// <returns>A Part element, or null when the part has nothing to report.</returns>
private static XElement GetMetricsForWmlPart(OpenXmlPart part, MetricsGetterSettings settings)
{
    bool isExaminedPart =
        part is MainDocumentPart ||
        part is HeaderPart ||
        part is FooterPart ||
        part is FootnotesPart ||
        part is EndnotesPart;

    XElement contentControls = null;
    if (isExaminedPart)
    {
        XDocument partXml = part.GetXDocument();
        XElement transformed = (XElement)GetContentControlsTransform(partXml.Root, settings);
        // An empty ContentControls element is not reported.
        if (transformed.HasElements)
            contentControls = transformed;
    }

    XElement partMetrics = new XElement(H.Part,
        new XAttribute(H.ContentType, part.ContentType),
        new XAttribute(H.Uri, part.Uri.ToString()),
        contentControls);
    return partMetrics.HasElements ? partMetrics : null;
}
| |
/// <summary>
/// Recursive transform that reduces a part's XML to its content controls: the root
/// becomes a ContentControls element, every w:sdt becomes a ContentControl element
/// (with type, tag, alias and XPath), and all other elements are replaced by the
/// transform of their child nodes.
/// </summary>
/// <param name="node">The node to transform.</param>
/// <param name="settings">
/// IncludeTextInContentControls decides whether non-element nodes are kept or dropped.
/// </param>
/// <returns>An element, a sequence of transformed children, the node itself, or null.</returns>
private static object GetContentControlsTransform(XNode node, MetricsGetterSettings settings)
{
    XElement element = node as XElement;
    if (element == null)
    {
        // Non-element node: kept only on request.
        if (settings.IncludeTextInContentControls)
            return node;
        return null;
    }

    if (element == element.Document.Root)
    {
        return new XElement(H.ContentControls,
            element.Nodes().Select(n => GetContentControlsTransform(n, settings)));
    }

    if (element.Name != W.sdt)
        return element.Nodes().Select(n => GetContentControlsTransform(n, settings));

    // Children of w:sdtPr, in document order.
    List<XElement> sdtProperties = element.Elements(W.sdtPr).Elements().ToList();

    string tag = (string)sdtProperties.Where(p => p.Name == W.tag).Attributes(W.val).FirstOrDefault();
    XAttribute tagAttr = null;
    if (tag != null)
        tagAttr = new XAttribute(H.Tag, tag);

    string alias = (string)sdtProperties.Where(p => p.Name == W.alias).Attributes(W.val).FirstOrDefault();
    XAttribute aliasAttr = null;
    if (alias != null)
        aliasAttr = new XAttribute(H.Alias, alias);

    XAttribute xPathAttr = new XAttribute(H.XPath, element.GetXPath());

    // Kind markers in ascending precedence: when several are present the last one
    // wins. w:richText beats all of them and is also the answer when none is present.
    KeyValuePair<XName, string>[] kindMarkers =
    {
        new KeyValuePair<XName, string>(W.text, "Text"),
        new KeyValuePair<XName, string>(W.bibliography, "Bibliography"),
        new KeyValuePair<XName, string>(W.citation, "Citation"),
        new KeyValuePair<XName, string>(W.comboBox, "ComboBox"),
        new KeyValuePair<XName, string>(W.date, "Date"),
        new KeyValuePair<XName, string>(W.docPartList, "DocPartList"),
        new KeyValuePair<XName, string>(W.docPartObj, "DocPartObj"),
        new KeyValuePair<XName, string>(W.dropDownList, "DropDownList"),
        new KeyValuePair<XName, string>(W.equation, "Equation"),
        new KeyValuePair<XName, string>(W.group, "Group"),
        new KeyValuePair<XName, string>(W.picture, "Picture"),
    };
    HashSet<XName> propertyNames = new HashSet<XName>(sdtProperties.Select(p => p.Name));
    string type = "RichText";
    if (!propertyNames.Contains(W.richText))
    {
        foreach (KeyValuePair<XName, string> marker in kindMarkers)
        {
            if (propertyNames.Contains(marker.Key))
                type = marker.Value;
        }
    }
    XAttribute typeAttr = new XAttribute(H.Type, type);

    return new XElement(H.ContentControl,
        typeAttr,
        tagAttr,
        aliasAttr,
        xPathAttr,
        element.Nodes().Select(n => GetContentControlsTransform(n, settings)));
}
| } |
| |
/// <summary>
/// Shared <see cref="XName"/> values in no namespace. Each field's value equals its own
/// identifier. Within this file, for example, <c>H.Part</c> and <c>H.ContentControl</c> are
/// used as element names and <c>H.Tag</c> and <c>H.Type</c> as attribute names of the
/// generated metrics XML.
/// </summary>
/// <remarks>
/// The fields are <c>readonly</c> so that a shared name cannot be reassigned by accident.
/// Entries are kept in ordinal (case-sensitive) order.
/// </remarks>
public static class H
{
    public static readonly XName ActiveX = "ActiveX";
    public static readonly XName Alias = "Alias";
    public static readonly XName AltChunk = "AltChunk";
    public static readonly XName Arguments = "Arguments";
    public static readonly XName AsciiCharCount = "AsciiCharCount";
    public static readonly XName AsciiRunCount = "AsciiRunCount";
    public static readonly XName AverageParagraphLength = "AverageParagraphLength";
    public static readonly XName BaselineReport = "BaselineReport";
    public static readonly XName Batch = "Batch";
    public static readonly XName BatchName = "BatchName";
    public static readonly XName BatchSelector = "BatchSelector";
    public static readonly XName CSCharCount = "CSCharCount";
    public static readonly XName CSRunCount = "CSRunCount";
    public static readonly XName Catalog = "Catalog";
    public static readonly XName CatalogList = "CatalogList";
    public static readonly XName CatalogListFile = "CatalogListFile";
    public static readonly XName CaughtException = "CaughtException";
    public static readonly XName Cell = "Cell";
    public static readonly XName Column = "Column";
    public static readonly XName Columns = "Columns";
    public static readonly XName ComplexField = "ComplexField";
    public static readonly XName Computer = "Computer";
    public static readonly XName Computers = "Computers";
    public static readonly XName ContentControl = "ContentControl";
    public static readonly XName ContentControls = "ContentControls";
    public static readonly XName ContentType = "ContentType";
    public static readonly XName ContentTypes = "ContentTypes";
    public static readonly XName CustomXmlMarkup = "CustomXmlMarkup";
    public static readonly XName DLL = "DLL";
    public static readonly XName DefaultDialogValuesFile = "DefaultDialogValuesFile";
    public static readonly XName DefaultValues = "DefaultValues";
    public static readonly XName Dependencies = "Dependencies";
    public static readonly XName DestinationDir = "DestinationDir";
    public static readonly XName Directory = "Directory";
    public static readonly XName DirectoryPattern = "DirectoryPattern";
    public static readonly XName DisplayName = "DisplayName";
    public static readonly XName DoJobQueueName = "DoJobQueueName";
    public static readonly XName Document = "Document";
    public static readonly XName DocumentProtection = "DocumentProtection";
    public static readonly XName DocumentSelector = "DocumentSelector";
    public static readonly XName DocumentType = "DocumentType";
    public static readonly XName Documents = "Documents";
    public static readonly XName EastAsiaCharCount = "EastAsiaCharCount";
    public static readonly XName EastAsiaRunCount = "EastAsiaRunCount";
    public static readonly XName ElementCount = "ElementCount";
    public static readonly XName EmbeddedXlsx = "EmbeddedXlsx";
    public static readonly XName Error = "Error";
    public static readonly XName Exception = "Exception";
    public static readonly XName Exe = "Exe";
    public static readonly XName ExeRoot = "ExeRoot";
    public static readonly XName Extension = "Extension";
    public static readonly XName File = "File";
    public static readonly XName FileLength = "FileLength";
    public static readonly XName FileName = "FileName";
    public static readonly XName FilePattern = "FilePattern";
    public static readonly XName FileType = "FileType";
    public static readonly XName Guid = "Guid";
    public static readonly XName HAnsiCharCount = "HAnsiCharCount";
    public static readonly XName HAnsiRunCount = "HAnsiRunCount";
    public static readonly XName Hyperlink = "Hyperlink";
    public static readonly XName IPAddress = "IPAddress";
    public static readonly XName Id = "Id";
    public static readonly XName Invalid = "Invalid";
    public static readonly XName InvalidHyperlink = "InvalidHyperlink";
    public static readonly XName InvalidHyperlinkException = "InvalidHyperlinkException";
    public static readonly XName InvalidSaveThroughXslt = "InvalidSaveThroughXslt";
    public static readonly XName JobComplete = "JobComplete";
    public static readonly XName JobExe = "JobExe";
    public static readonly XName JobName = "JobName";
    public static readonly XName JobSpec = "JobSpec";
    public static readonly XName Languages = "Languages";
    public static readonly XName LegacyFrame = "LegacyFrame";
    public static readonly XName LocalDoJobQueue = "LocalDoJobQueue";
    public static readonly XName MachineName = "MachineName";
    public static readonly XName MaxConcurrentJobs = "MaxConcurrentJobs";
    public static readonly XName MaxDocumentsInJob = "MaxDocumentsInJob";
    public static readonly XName MaxParagraphLength = "MaxParagraphLength";
    public static readonly XName Message = "Message";
    public static readonly XName Metrics = "Metrics";
    public static readonly XName MultiDirectory = "MultiDirectory";
    public static readonly XName MultiFontRun = "MultiFontRun";
    public static readonly XName MultiServerQueue = "MultiServerQueue";
    public static readonly XName Name = "Name";
    public static readonly XName Namespace = "Namespace";
    public static readonly XName NamespaceName = "NamespaceName";
    public static readonly XName NamespacePrefix = "NamespacePrefix";
    public static readonly XName Namespaces = "Namespaces";
    public static readonly XName Note = "Note";
    public static readonly XName NumberingFormatList = "NumberingFormatList";
    public static readonly XName ObjectDisposedException = "ObjectDisposedException";
    public static readonly XName ParagraphCount = "ParagraphCount";
    public static readonly XName Part = "Part";
    public static readonly XName Parts = "Parts";
    public static readonly XName PassedDocuments = "PassedDocuments";
    public static readonly XName Path = "Path";
    public static readonly XName ProduceCatalog = "ProduceCatalog";
    public static readonly XName ReferenceToNullImage = "ReferenceToNullImage";
    public static readonly XName Report = "Report";
    public static readonly XName RevisionTracking = "RevisionTracking";
    public static readonly XName Root = "Root";
    public static readonly XName RootDirectory = "RootDirectory";
    public static readonly XName Row = "Row";
    public static readonly XName RunCount = "RunCount";
    public static readonly XName RunWithoutRprCount = "RunWithoutRprCount";
    public static readonly XName SdkValidationError = "SdkValidationError";
    public static readonly XName SdkValidationError2007 = "SdkValidationError2007";
    public static readonly XName SdkValidationError2010 = "SdkValidationError2010";
    public static readonly XName SdkValidationError2013 = "SdkValidationError2013";
    public static readonly XName Sheet = "Sheet";
    public static readonly XName Sheets = "Sheets";
    public static readonly XName SimpleField = "SimpleField";
    public static readonly XName Skip = "Skip";
    public static readonly XName SmartTag = "SmartTag";
    public static readonly XName SourceRootDir = "SourceRootDir";
    public static readonly XName SpawnerJobExeLocation = "SpawnerJobExeLocation";
    public static readonly XName SpawnerReady = "SpawnerReady";
    public static readonly XName Style = "Style";
    public static readonly XName StyleHierarchy = "StyleHierarchy";
    public static readonly XName SubDocument = "SubDocument";
    public static readonly XName Table = "Table";
    public static readonly XName TableData = "TableData";
    public static readonly XName Tag = "Tag";
    public static readonly XName Take = "Take";
    public static readonly XName TextBox = "TextBox";
    public static readonly XName TrackRevisionsEnabled = "TrackRevisionsEnabled";
    public static readonly XName Type = "Type";
    public static readonly XName Uri = "Uri";
    public static readonly XName Val = "Val";
    public static readonly XName Valid = "Valid";
    public static readonly XName WindowStyle = "WindowStyle";
    public static readonly XName XPath = "XPath";
    public static readonly XName ZeroLengthText = "ZeroLengthText";
    public static readonly XName custDataLst = "custDataLst";
    public static readonly XName custShowLst = "custShowLst";
    public static readonly XName kinsoku = "kinsoku";
    public static readonly XName modifyVerifier = "modifyVerifier";
    public static readonly XName photoAlbum = "photoAlbum";
}
| } |