// Source blob: 1ad01246037f5f679f4f73cd081f863c3cdc55fa
/***************************************************************************
Copyright (c) Microsoft Corporation 2012-2015.
This code is licensed using the Microsoft Public License (Ms-PL). The text of the license can be found here:
http://www.microsoft.com/resources/sharedsource/licensingbasics/publiclicense.mspx
Published at http://OpenXmlDeveloper.org
Resource Center and Documentation: http://openxmldeveloper.org/wiki/w/wiki/powertools-for-open-xml.aspx
Developer: Eric White
Blog: http://www.ericwhite.com
Twitter: @EricWhiteDev
Email: eric@ericwhite.com
***************************************************************************/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Xml.Linq;
namespace OpenXmlPowerTools
{
public class OpenXmlRegex
{
// Sentinel grouping key: PmlSearchAndReplaceTransform returns it from GroupAdjacent for
// elements that must not be merged with their neighbours when runs are consolidated.
private const string DontConsolidate = "DontConsolidate";
// Names of the revision-tracking elements whose w:id attribute ReplaceInternal inspects:
// after a WordprocessingML replace it adds an id where one is missing and renumbers
// duplicates among elements with these names.
private static readonly HashSet<XName> RevTrackMarkupWithId = new HashSet<XName>
{
W.cellDel,
W.cellIns,
W.cellMerge,
W.customXmlDelRangeEnd,
W.customXmlDelRangeStart,
W.customXmlInsRangeEnd,
W.customXmlInsRangeStart,
W.customXmlMoveFromRangeEnd,
W.customXmlMoveFromRangeStart,
W.customXmlMoveToRangeEnd,
W.customXmlMoveToRangeStart,
W.del,
W.ins,
W.moveFrom,
W.moveFromRangeEnd,
W.moveFromRangeStart,
W.moveTo,
W.moveToRangeEnd,
W.moveToRangeStart,
W.pPrChange,
W.rPrChange,
W.sectPrChange,
W.tblGridChange,
W.tblPrChange,
W.tblPrExChange,
W.tcPrChange
};
/// <summary>
/// Runs the search in match-only mode (no replacement text, no callback) and returns
/// the match count reported by <c>ReplaceInternal</c>.
/// </summary>
/// <param name="content">Elements to search.</param>
/// <param name="regex">Pattern to look for.</param>
/// <returns>The number of matches found.</returns>
public static int Match(IEnumerable<XElement> content, Regex regex)
{
    return ReplaceInternal(
        content,
        regex,
        replacement: null,
        callback: null,
        trackRevisions: false,
        revisionTrackingAuthor: null,
        coalesceContent: true);
}
/// <summary>
/// Runs the search in match-only mode and reports every match to <paramref name="found"/>.
/// When <paramref name="found"/> is null nothing is reported and only the count is returned.
/// </summary>
/// <param name="content">Elements to search.</param>
/// <param name="regex">Pattern to look for.</param>
/// <param name="found">Optional action invoked with the paragraph element and the match.</param>
/// <returns>The number of matches found.</returns>
public static int Match(IEnumerable<XElement> content, Regex regex, Action<XElement, Match> found)
{
    // Adapt the optional Action to the bool-returning callback shape ReplaceInternal expects.
    Func<XElement, Match, bool> notify = (paragraph, match) =>
    {
        if (found != null)
        {
            found(paragraph, match);
        }
        return true;
    };

    return ReplaceInternal(
        content,
        regex,
        replacement: null,
        callback: notify,
        trackRevisions: false,
        revisionTrackingAuthor: null,
        coalesceContent: true);
}
/// <summary>
/// Replaces (or, for an empty replacement string, deletes) the text matched by
/// <paramref name="regex"/>, without revision tracking and with run coalescing enabled.
/// <para>
/// When <paramref name="doReplacement"/> is null every match is processed. Otherwise it is
/// called per match and the match is only processed when it returns true; returning true
/// once and false afterwards therefore replaces only the first match.
/// </para>
/// </summary>
/// <param name="content">Elements to search.</param>
/// <param name="regex">Pattern to look for.</param>
/// <param name="replacement">Replacement text; an empty string deletes the matched text.</param>
/// <param name="doReplacement">Optional per-match filter.</param>
/// <returns>The number of matches found.</returns>
public static int Replace(IEnumerable<XElement> content, Regex regex, string replacement,
    Func<XElement, Match, bool> doReplacement)
{
    return ReplaceInternal(
        content,
        regex,
        replacement,
        doReplacement,
        trackRevisions: false,
        revisionTrackingAuthor: null,
        coalesceContent: true);
}
/// <summary>
/// Same as the four-argument overload, but lets the caller decide whether paragraph content
/// is coalesced afterwards. Turning coalescing off is needed by DocumentAssembler.
/// </summary>
/// <param name="content">Elements to search.</param>
/// <param name="regex">Pattern to look for.</param>
/// <param name="replacement">Replacement text; an empty string deletes the matched text.</param>
/// <param name="doReplacement">Optional per-match filter; a match is processed only when it returns true.</param>
/// <param name="coalesceContent">Forwarded to the WordprocessingML transform to control run coalescing.</param>
/// <returns>The number of matches found.</returns>
public static int Replace(IEnumerable<XElement> content, Regex regex, string replacement,
    Func<XElement, Match, bool> doReplacement, bool coalesceContent)
{
    return ReplaceInternal(
        content,
        regex,
        replacement,
        doReplacement,
        trackRevisions: false,
        revisionTrackingAuthor: null,
        coalesceContent: coalesceContent);
}
/// <summary>
/// Replaces (or, for an empty replacement string, deletes) the text matched by
/// <paramref name="regex"/>, optionally recording the change as tracked revisions.
/// <para>
/// When <paramref name="doReplacement"/> is null every match is processed. Otherwise it is
/// called per match and the match is only processed when it returns true; returning true
/// once and false afterwards therefore replaces only the first match.
/// </para>
/// <para>
/// With <paramref name="trackRevisions"/> set, the change is written as revision-tracking
/// markup attributed to <paramref name="author"/>. Presentation content does not support
/// this: an exception is thrown when revision tracking is requested for it.
/// </para>
/// </summary>
/// <param name="content">Elements to search.</param>
/// <param name="regex">Pattern to look for.</param>
/// <param name="replacement">Replacement text; an empty string deletes the matched text.</param>
/// <param name="doReplacement">Optional per-match filter.</param>
/// <param name="trackRevisions">True to emit revision-tracking markup instead of editing in place.</param>
/// <param name="author">Author name written on the revision-tracking markup.</param>
/// <returns>The number of matches found.</returns>
public static int Replace(IEnumerable<XElement> content, Regex regex, string replacement,
    Func<XElement, Match, bool> doReplacement, bool trackRevisions, string author)
{
    return ReplaceInternal(
        content,
        regex,
        replacement,
        doReplacement,
        trackRevisions: trackRevisions,
        revisionTrackingAuthor: author,
        coalesceContent: true);
}
/// <summary>
/// Shared worker behind the public Match / Replace overloads. The namespace of the first
/// content element selects the WordprocessingML or the DrawingML / PresentationML transform;
/// each content element then has its child nodes replaced by the transformed nodes.
/// For WordprocessingML the w:id attributes of revision-tracking elements in the containing
/// tree are afterwards completed and made unique.
/// </summary>
/// <param name="content">Elements to process; must not be null.</param>
/// <param name="regex">Pattern to look for; must not be null.</param>
/// <param name="replacement">Replacement text, or null for match-only mode.</param>
/// <param name="callback">Optional per-match callback.</param>
/// <param name="trackRevisions">True to emit revision-tracking markup (WordprocessingML only).</param>
/// <param name="revisionTrackingAuthor">Author written on the revision-tracking markup.</param>
/// <param name="coalesceContent">Forwarded to the WordprocessingML transform.</param>
/// <returns>The accumulated match count; 0 for empty content or an unrecognised namespace.</returns>
/// <exception cref="ArgumentNullException">content or regex is null.</exception>
/// <exception cref="OpenXmlPowerToolsException">Revision tracking requested for presentation content.</exception>
private static int ReplaceInternal(IEnumerable<XElement> content, Regex regex, string replacement,
    Func<XElement, Match, bool> callback, bool trackRevisions, string revisionTrackingAuthor,
    bool coalesceContent)
{
    if (content == null) throw new ArgumentNullException("content");
    if (regex == null) throw new ArgumentNullException("regex");

    // Materialise lazily-evaluated input once; it is walked more than once below.
    IList<XElement> targets = content as IList<XElement> ?? content.ToList();
    XElement head = targets.FirstOrDefault();
    if (head == null)
    {
        return 0;
    }

    XNamespace ns = head.Name.Namespace;
    if (ns != W.w)
    {
        if (ns != P.p && ns != A.a)
        {
            return 0;
        }

        if (trackRevisions)
        {
            throw new OpenXmlPowerToolsException("PPTX does not support revision tracking");
        }

        var pmlTally = new ReplaceInternalInfo { Count = 0 };
        foreach (XElement target in targets)
        {
            var transformed = (XElement) PmlSearchAndReplaceTransform(target, regex, replacement, callback, pmlTally);
            target.ReplaceNodes(transformed.Nodes());
        }
        return pmlTally.Count;
    }

    var wmlTally = new ReplaceInternalInfo { Count = 0 };
    foreach (XElement target in targets)
    {
        var transformed = (XElement) WmlSearchAndReplaceTransform(target, regex, replacement, callback,
            trackRevisions, revisionTrackingAuthor, wmlTally, coalesceContent);
        target.ReplaceNodes(transformed.Nodes());
    }

    // The id fix-up works on the whole tree that contains the first content element.
    XElement root = head.AncestorsAndSelf().Last();

    // Highest id currently in use below the root (never less than 0).
    int highestId = 0;
    IEnumerable<XAttribute> existingIds = root
        .Descendants()
        .Where(d => RevTrackMarkupWithId.Contains(d.Name))
        .Attributes(W.id);
    foreach (XAttribute existingId in existingIds)
    {
        int value = (int) existingId;
        if (value > highestId)
        {
            highestId = value;
        }
    }
    int nextId = highestId + 1;

    // Give every revision-tracking element that lacks an id a fresh one.
    foreach (XElement mark in root.DescendantsAndSelf())
    {
        if (!RevTrackMarkupWithId.Contains(mark.Name)) continue;
        if (mark.Attribute(W.id) != null) continue;
        mark.Add(new XAttribute(W.id, nextId++));
    }

    // For each id that occurs more than once, keep the first holder and renumber the rest.
    List<XElement> repeats = root
        .DescendantsAndSelf()
        .Where(d => RevTrackMarkupWithId.Contains(d.Name))
        .GroupBy(d => (int) d.Attribute(W.id))
        .Where(g => g.Count() > 1)
        .SelectMany(g => g.Skip(1))
        .ToList();
    foreach (XElement repeat in repeats)
    {
        XAttribute idAttribute = repeat.Attribute(W.id);
        if (idAttribute != null)
        {
            idAttribute.Value = nextId.ToString();
        }
        nextId++;
    }

    return wmlTally.Count;
}
/// <summary>
/// Recursive transform that performs search / replace on WordprocessingML content.
/// Non-element nodes are returned unchanged. A w:p paragraph is searched and, when the
/// regex matches, rebuilt with its runs split so that matches can be mapped onto runs.
/// A w:r is split into one run per character / child, a w:ins containing runs is split
/// into one w:ins per resulting run, and every other element is copied with its children
/// transformed recursively.
/// </summary>
/// <param name="node">Node to transform.</param>
/// <param name="regex">Pattern to look for.</param>
/// <param name="replacement">Replacement text; null selects match-only mode.</param>
/// <param name="callback">Optional per-match callback; in replace mode a false result skips the match.</param>
/// <param name="trackRevisions">True to express the change as w:ins / w:del markup.</param>
/// <param name="revisionTrackingAuthor">Author written on generated w:ins / w:del elements.</param>
/// <param name="replInfo">Accumulator whose Count is increased by the number of matches per paragraph.</param>
/// <param name="coalesceContent">True to coalesce adjacent runs of the returned paragraph.</param>
/// <returns>
/// The transformed node: an XElement, the original node, or a sequence of elements
/// (for w:r and w:ins) intended to be added as content of a new parent element.
/// </returns>
private static object WmlSearchAndReplaceTransform(XNode node, Regex regex, string replacement,
Func<XElement, Match, bool> callback, bool trackRevisions, string revisionTrackingAuthor,
ReplaceInternalInfo replInfo, bool coalesceContent)
{
var element = node as XElement;
if (element == null) return node;
if (element.Name == W.p)
{
XElement paragraph = element;
// Cheap pre-check on the unsplit paragraph: concatenate the text of all runs that are not
// direct children of w:del. DescendantsTrimmed(W.txbxContent) presumably stops descending
// at text-box content - confirm against its definition.
string preliminaryContent = paragraph
.DescendantsTrimmed(W.txbxContent)
.Where(d => d.Name == W.r && (d.Parent == null || d.Parent.Name != W.del))
.Select(UnicodeMapper.RunToString)
.StringConcatenate();
if (regex.IsMatch(preliminaryContent))
{
// Rebuild the paragraph with every child transformed; the w:r branch below splits runs
// so that (presumably) each resulting run maps to exactly one character of the text.
var paragraphWithSplitRuns = new XElement(W.p,
paragraph.Attributes(),
paragraph.Nodes().Select(n => WmlSearchAndReplaceTransform(n, regex, replacement, callback,
trackRevisions, revisionTrackingAuthor, replInfo, coalesceContent)));
IEnumerable<XElement> runsTrimmed = paragraphWithSplitRuns
.DescendantsTrimmed(W.txbxContent)
.Where(d => d.Name == W.r && (d.Parent == null || d.Parent.Name != W.del));
var charsAndRuns = runsTrimmed
.Select(r => new { Ch = UnicodeMapper.RunToString(r), r })
.ToList();
// content is the searchable text; alignedRuns is indexed with match.Index / match.Length
// below, which assumes one run per character of content.
string content = charsAndRuns.Select(t => t.Ch).StringConcatenate();
XElement[] alignedRuns = charsAndRuns.Select(t => t.r).ToArray();
MatchCollection matchCollection = regex.Matches(content);
// Every match is counted, including zero-length ones and ones the callback later rejects.
replInfo.Count += matchCollection.Count;
// Process Match
// Match-only mode: the original paragraph is returned; the callback's result is ignored.
if (replacement == null)
{
if (callback == null) return paragraph;
foreach (Match match in matchCollection.Cast<Match>())
callback(paragraph, match);
return paragraph;
}
// Process Replace
foreach (Match match in matchCollection.Cast<Match>())
{
// Zero-length matches select no runs, so there is nothing to replace.
if (match.Length == 0) continue;
if ((callback != null) && !callback(paragraph, match)) continue;
// Runs covered by this match.
List<XElement> runCollection = alignedRuns
.Skip(match.Index)
.Take(match.Length)
.ToList();
// uses the Skip / Take special semantics of array to implement efficient finding of sub array
XElement firstRun = runCollection.First();
XElement firstRunProperties = firstRun.Elements(W.rPr).FirstOrDefault();
// save away first run properties
if (trackRevisions)
{
// Tracked replace: insert the new text as w:ins (skipped for an empty replacement),
// then mark each matched run as deleted.
if (replacement != "")
{
// We coalesce runs as some methods, e.g., in DocumentAssembler,
// will try to find the replacement string even though they
// set coalesceContent to false.
string newTextValue = match.Result(replacement);
List<XElement> newRuns = UnicodeMapper.StringToCoalescedRunList(newTextValue,
firstRunProperties);
var newIns = new XElement(W.ins,
new XAttribute(W.author, revisionTrackingAuthor),
new XAttribute(W.date, DateTime.UtcNow.ToString("s") + "Z"),
newRuns);
// If the first matched run is itself inside a w:ins, insert before that w:ins.
if (firstRun.Parent != null && firstRun.Parent.Name == W.ins)
firstRun.Parent.AddBeforeSelf(newIns);
else
firstRun.AddBeforeSelf(newIns);
}
foreach (XElement run in runCollection)
{
bool isInIns = run.Parent != null && run.Parent.Name == W.ins;
if (isInIns)
{
XElement parentIns = run.Parent;
XElement grandParentParagraph = parentIns.Parent;
if (grandParentParagraph != null)
{
if ((string) parentIns.Attributes(W.author).FirstOrDefault() ==
revisionTrackingAuthor)
{
// Insertion by the same author: drop the w:ins altogether.
// Note: the parent is rebuilt from Elements(), so its non-element child nodes are not kept.
List<XElement> parentInsSiblings = grandParentParagraph
.Elements()
.Where(c => c != parentIns)
.ToList();
grandParentParagraph.ReplaceNodes(parentInsSiblings);
}
else
{
// Insertion by another author: keep the w:ins and wrap its content in a w:del
// attributed to the current author.
List<XElement> parentInsSiblings = grandParentParagraph
.Elements()
.Select(c => c == parentIns
? new XElement(W.ins,
parentIns.Attributes(),
new XElement(W.del,
new XAttribute(W.author, revisionTrackingAuthor),
new XAttribute(W.date, DateTime.UtcNow.ToString("s") + "Z"),
parentIns.Elements().Select(TransformToDelText)))
: c)
.ToList();
grandParentParagraph.ReplaceNodes(parentInsSiblings);
}
}
}
else
{
// Plain run: replace it with a w:del holding the run converted to deleted text.
var delRun = new XElement(W.del,
new XAttribute(W.author, revisionTrackingAuthor),
new XAttribute(W.date, DateTime.UtcNow.ToString("s") + "Z"),
TransformToDelText(run));
run.ReplaceWith(delRun);
}
}
}
else // not tracked revisions
{
// Remove every matched run except the first (removing the enclosing w:ins when there is one).
foreach (XElement runToDelete in runCollection.Skip(1).ToList())
if (runToDelete.Parent != null && runToDelete.Parent.Name == W.ins)
runToDelete.Parent.Remove();
else
runToDelete.Remove();
// We coalesce runs as some methods, e.g., in DocumentAssembler,
// will try to find the replacement string even though they
// set coalesceContent to false.
string newTextValue = match.Result(replacement);
List<XElement> newRuns = UnicodeMapper.StringToCoalescedRunList(newTextValue,
firstRunProperties);
// The first matched run (or its enclosing w:ins) is replaced by the new runs.
if (firstRun.Parent != null && firstRun.Parent.Name == W.ins)
firstRun.Parent.ReplaceWith(newRuns);
else
firstRun.ReplaceWith(newRuns);
}
}
return coalesceContent
? WordprocessingMLUtil.CoalesceAdjacentRunsWithIdenticalFormatting(paragraphWithSplitRuns)
: paragraphWithSplitRuns;
}
// No match in this paragraph: copy it. Paragraph properties, runs with text or tabs and
// insertions containing run text are reused as they are (not split); any other child
// element is transformed recursively.
var newParagraph = new XElement(W.p,
paragraph.Attributes(),
paragraph.Nodes().Select(n =>
{
var e = n as XElement;
if (e == null) return n;
if (e.Name == W.pPr)
return e;
// As written, the w:tab test applies to any element name, not only to w:r.
if (((e.Name == W.r) && e.Elements(W.t).Any()) || e.Elements(W.tab).Any())
return e;
if ((e.Name == W.ins) && e.Elements(W.r).Elements(W.t).Any())
return e;
return WmlSearchAndReplaceTransform(e, regex, replacement, callback,
trackRevisions, revisionTrackingAuthor, replInfo, coalesceContent);
}));
return coalesceContent
? WordprocessingMLUtil.CoalesceAdjacentRunsWithIdenticalFormatting(newParagraph) // CoalesceContent(newParagraph)
: newParagraph;
}
if (element.Name == W.ins && element.Elements(W.r).Any())
{
// Transform each child of the insertion. A child that comes back as a sequence of
// elements (the w:r branch returns one) has each element wrapped in its own w:ins
// carrying the original attributes; other results are passed through.
List<object> collectionOfCollections = element
.Elements()
.Select(n => WmlSearchAndReplaceTransform(n, regex, replacement, callback, trackRevisions,
revisionTrackingAuthor, replInfo, coalesceContent))
.ToList();
List<object> collectionOfIns = collectionOfCollections
.Select(c =>
{
var elements = c as IEnumerable<XElement>;
return elements != null
? elements.Select(ixc => new XElement(W.ins, element.Attributes(), ixc))
: c;
})
.ToList();
return collectionOfIns;
}
if (element.Name == W.r)
{
// Split the run: one new run per character of each w:t, and one new run per other
// non-rPr child. Every new run gets a copy of the original run properties.
return element.Elements()
.Where(e => e.Name != W.rPr)
.Select(e => e.Name == W.t
? ((string) e).Select(c =>
new XElement(W.r,
element.Elements(W.rPr),
new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(c), c)))
: new[] { new XElement(W.r, element.Elements(W.rPr), e) })
.SelectMany(t => t);
}
// Any other element: copy name and attributes, transform child nodes recursively.
return new XElement(element.Name,
element.Attributes(),
element.Nodes()
.Select(n => WmlSearchAndReplaceTransform(n, regex, replacement, callback, trackRevisions,
revisionTrackingAuthor, replInfo, coalesceContent)));
}
/// <summary>
/// Produces a copy of <paramref name="node"/> in which every w:t element is replaced by a
/// w:delText element holding the same text. Non-element nodes are returned unchanged.
/// </summary>
/// <param name="node">Node to convert.</param>
/// <returns>The converted element, or the original node when it is not an element.</returns>
private static object TransformToDelText(XNode node)
{
    var source = node as XElement;
    if (source == null)
    {
        return node;
    }

    if (source.Name != W.t)
    {
        // Not a text element: rebuild it with the same name and attributes and convert
        // its child nodes recursively.
        IEnumerable<object> convertedChildren = source.Nodes().Select(TransformToDelText);
        return new XElement(source.Name, source.Attributes(), convertedChildren);
    }

    string text = source.Value;
    return new XElement(W.delText, XmlUtil.GetXmlSpaceAttribute(text), text);
}
/// <summary>
/// Recursive transform that performs search / replace on DrawingML (a:*) content.
/// Non-element nodes are returned unchanged. An a:p paragraph is searched and, when the
/// regex matches, rebuilt with its runs split into one run per character so that matches
/// can be mapped onto runs. An a:r that contains text is split, and every other element is
/// copied with its children transformed recursively.
/// </summary>
/// <param name="node">Node to transform.</param>
/// <param name="regex">Pattern to look for.</param>
/// <param name="replacement">
/// Replacement text; null selects match-only mode. The text is inserted literally
/// (regex substitution patterns are not expanded).
/// </param>
/// <param name="callback">
/// Optional per-match callback. In match-only mode its result is ignored; in replace mode
/// a false result skips the match.
/// </param>
/// <param name="counter">Accumulator whose Count is increased by the number of matches per paragraph.</param>
/// <returns>
/// The transformed node: an XElement, the original node, or (for a split run) a sequence
/// intended to be added as content of a new parent element.
/// </returns>
private static object PmlSearchAndReplaceTransform(XNode node, Regex regex, string replacement,
    Func<XElement, Match, bool> callback, ReplaceInternalInfo counter)
{
    var element = node as XElement;
    if (element == null) return node;

    if (element.Name == A.p)
    {
        XElement paragraph = element;

        // Cheap pre-check on the unsplit paragraph text.
        string contents = element.Descendants(A.t).Select(t => (string) t).StringConcatenate();
        if (!regex.IsMatch(contents))
            return new XElement(element.Name, element.Attributes(), element.Nodes());

        // Rebuild the paragraph with every child transformed; the a:r branch below splits
        // text runs into one run per character.
        var paragraphWithSplitRuns = new XElement(A.p,
            paragraph.Attributes(),
            paragraph.Nodes()
                .Select(n => PmlSearchAndReplaceTransform(n, regex, replacement, callback, counter)));
        List<XElement> runsTrimmed = paragraphWithSplitRuns
            .Descendants(A.r)
            .ToList();

        // One entry per run: its text, or a one-character placeholder for runs without a:t,
        // so that positions in the searched string line up with indexes into alignedRuns.
        var charsAndRuns = runsTrimmed
            .Select(r =>
                r.Element(A.t) != null
                    ? new { Ch = r.Element(A.t).Value, r }
                    : new { Ch = "\x01", r })
            .ToList();
        string content = charsAndRuns.Select(t => t.Ch).StringConcatenate();
        XElement[] alignedRuns = charsAndRuns.Select(t => t.r).ToArray();

        MatchCollection matchCollection = regex.Matches(content);
        // Every match is counted, including ones that are skipped below.
        counter.Count += matchCollection.Count;

        if (replacement == null)
        {
            // Match-only mode. The callback is optional (Match(content, regex) passes null),
            // so only notify when there is one; previously this dereferenced null.
            if (callback != null)
            {
                foreach (Match match in matchCollection.Cast<Match>())
                    callback(paragraph, match);
            }
        }
        else
        {
            foreach (Match match in matchCollection.Cast<Match>())
            {
                // A zero-length match selects no runs, so there is nothing to replace;
                // skipping it also keeps First() below from throwing on an empty list.
                // This mirrors the WordprocessingML transform.
                if (match.Length == 0) continue;
                if ((callback != null) && !callback(paragraph, match)) continue;

                // Runs covered by this match.
                List<XElement> runCollection = alignedRuns
                    .Skip(match.Index)
                    .Take(match.Length)
                    .ToList();

                // Keep the first run: its run properties are reused for the replacement.
                XElement firstRun = runCollection.First();

                // Remove(this IEnumerable<XElement>) is the LINQ to XML extension that
                // snapshots the sequence and removes every element from its parent.
                runCollection.Skip(1).Remove();

                // NOTE(review): the replacement is inserted literally here, whereas the
                // WordprocessingML transform uses match.Result(replacement). Left unchanged
                // to preserve existing behavior - confirm whether that is intended.
                var newFirstRun = new XElement(A.r,
                    firstRun.Element(A.rPr),
                    new XElement(A.t, replacement));

                // Swap the new run in at the position of the first matched run.
                firstRun.ReplaceWith(newFirstRun);
            }

            // Merge adjacent text-only runs whose run properties serialize identically.
            XElement paragraphWithReplacedRuns = paragraphWithSplitRuns;
            IEnumerable<IGrouping<string, XElement>> groupedAdjacentRunsWithIdenticalFormatting =
                paragraphWithReplacedRuns
                    .Elements()
                    .GroupAdjacent(ce =>
                    {
                        if (ce.Name != A.r)
                            return DontConsolidate;
                        if ((ce.Elements().Count(e => e.Name != A.rPr) != 1) || (ce.Element(A.t) == null))
                            return DontConsolidate;
                        XElement rPr = ce.Element(A.rPr);
                        return rPr == null ? "" : rPr.ToString(SaveOptions.None);
                    });

            // Carry the paragraph attributes over, as every other path of this method does.
            var paragraphWithConsolidatedRuns = new XElement(A.p,
                paragraphWithReplacedRuns.Attributes(),
                groupedAdjacentRunsWithIdenticalFormatting.Select(g =>
                {
                    if (g.Key == DontConsolidate)
                        return (object) g;
                    string textValue = g.Select(r => r.Element(A.t).Value).StringConcatenate();
                    XAttribute xs = XmlUtil.GetXmlSpaceAttribute(textValue);
                    return new XElement(A.r,
                        g.First().Elements(A.rPr),
                        new XElement(A.t, xs, textValue));
                }));
            paragraph = paragraphWithConsolidatedRuns;
        }
        return paragraph;
    }

    if ((element.Name == A.r) && element.Elements(A.t).Any())
    {
        // Split the run: one new run per character of each a:t, and one new run per other
        // non-rPr child. Every new run gets a copy of the original run properties.
        return element.Elements()
            .Where(e => e.Name != A.rPr)
            .Select(e =>
            {
                if (e.Name == A.t)
                {
                    var s = (string) e;
                    IEnumerable<XElement> collectionOfSubRuns = s.Select(c => new XElement(A.r,
                        element.Elements(A.rPr),
                        new XElement(A.t, XmlUtil.GetXmlSpaceAttribute(c), c)));
                    return (object) collectionOfSubRuns;
                }
                return new XElement(A.r,
                    element.Elements(A.rPr),
                    e);
            });
    }

    // Any other element: copy name and attributes, transform child nodes recursively.
    return new XElement(element.Name,
        element.Attributes(),
        element.Nodes().Select(n => PmlSearchAndReplaceTransform(n, regex, replacement, callback, counter)));
}
/// <summary>
/// Mutable holder for the running match count, passed by reference through the recursive
/// search-and-replace transforms so that each paragraph can add its matches.
/// </summary>
private class ReplaceInternalInfo
{
    /// <summary>Number of matches accumulated so far.</summary>
    public int Count { get; set; }
}
}
}