blob: 10654f52cb7c29c18c55de71212f62d1671489df [file] [log] [blame]
/***************************************************************************
Copyright (c) Microsoft Corporation 2016.
This code is licensed using the Microsoft Public License (Ms-PL). The text of the license can be found here:
http://www.microsoft.com/resources/sharedsource/licensingbasics/publiclicense.mspx
Developer: Thomas Barnekow
Email: thomas@barnekow.info
***************************************************************************/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Xml.Linq;
namespace OpenXmlPowerTools
{
public class UnicodeMapper
{
// Unicode character values used by RunToString (element -> character) and
// CharToRunChild (character -> element).
// U+0001: returned by RunToString for every element it does not recognize;
// CharToRunChild turns it back into null.
public static readonly char StartOfHeading = '\u0001';
// U+0009: represents w:tab.
public static readonly char HorizontalTabulation = '\u0009';
// U+000A: not referenced elsewhere in this class.
public static readonly char LineFeed = '\u000A';
// U+000C: represents w:br with w:type="page".
public static readonly char FormFeed = '\u000C';
// U+000D: represents w:cr and w:br (without w:type or with w:type="textWrapping").
public static readonly char CarriageReturn = '\u000D';
// U+00AD: represents w:softHyphen.
public static readonly char SoftHyphen = '\u00AD';
// U+2011: represents w:noBreakHyphen.
public static readonly char NonBreakingHyphen = '\u2011';
// Unicode area boundaries.
// U+E000: initial value of _lastUnicodeChar, i.e., the value just below the
// first substitute character handed out by SymToChar.
public static readonly char StartOfPrivateUseArea = '\uE000';
// U+F000 and U+F8FF: not referenced elsewhere in this class.
public static readonly char StartOfSymbolArea = '\uF000';
public static readonly char EndOfPrivateUseArea = '\uF8FF';
// Dictionaries for w:sym stringification.
// Maps the serialized, standardized w:sym element (w:font, w:char, and the
// xmlns:w declaration only) to the character that represents it.
private static readonly Dictionary<string, char> SymStringToUnicodeCharDictionary =
new Dictionary<string, char>();
// Reverse mapping: the character that represents a symbol to its standardized
// w:sym element. CharToRunChild uses it to turn characters back into w:sym.
private static readonly Dictionary<char, XElement> UnicodeCharToSymDictionary =
new Dictionary<char, XElement>();
// Represents the Unicode value that was last used to map an actual character
// onto a special value in the private use area, which starts at U+E000.
// The value is pre-incremented before use, so the first substitute value
// handed out is U+E001.
// In Open XML, U+F000 is added to the actual Unicode values, so we should be
// well outside that range and would have to map 4096 different characters
// to get into the area starting at U+F000.
private static char _lastUnicodeChar = StartOfPrivateUseArea;
/// <summary>
/// Stringify an Open XML run or a single run child element.
///
/// A w:r element (unless its parent is w:del) becomes the concatenation of the
/// strings of its child elements. For run children, w:rPr becomes the empty
/// string; w:t becomes its text; w:br, w:cr, w:noBreakHyphen, w:softHyphen,
/// w:tab, and w:sym become their corresponding Unicode characters; w:fldChar
/// becomes "{" (begin), "}" (end), or "_" (anything else); w:instrText becomes
/// "_". Everything else, including a w:r whose parent is w:del and a w:br with
/// an unsupported w:type, becomes U+0001.
/// </summary>
/// <param name="element">An Open XML run or run child element.</param>
/// <returns>The corresponding Unicode string or U+0001.</returns>
public static string RunToString(XElement element)
{
    XName name = element.Name;

    // Runs are stringified child by child, except for runs sitting directly
    // inside a w:del element, which are treated like unrecognized elements.
    XElement parent = element.Parent;
    bool isInsideDeletion = parent != null && parent.Name == W.del;
    if (name == W.r && !isInsideDeletion)
        return element.Elements().Select(child => RunToString(child)).StringConcatenate();

    // Run properties do not contribute any text.
    if (name == W.rPr)
        return string.Empty;

    // Text elements contribute their value.
    if (name == W.t)
        return element.Value;

    // Breaks: page breaks become a Form Feed, text-wrapping breaks (the default
    // when w:type is absent) become a Carriage Return. Any other break type
    // falls through and ends up as U+0001 below.
    if (name == W.br)
    {
        var breakType = (string) element.Attribute(W.type);
        if (breakType == "page")
            return FormFeed.ToString();
        if (breakType == null || breakType == "textWrapping")
            return CarriageReturn.ToString();
    }

    // Elements standing for a single special character.
    if (name == W.cr)
        return CarriageReturn.ToString();
    if (name == W.noBreakHyphen)
        return NonBreakingHyphen.ToString();
    if (name == W.softHyphen)
        return SoftHyphen.ToString();
    if (name == W.tab)
        return HorizontalTabulation.ToString();

    // Field characters: the begin and end markers become braces; every other
    // field character type (or a missing w:fldCharType) becomes an underscore.
    if (name == W.fldChar)
    {
        var fieldCharType = (string) element.Attribute(W.fldCharType);
        if (fieldCharType == "begin")
            return "{";
        if (fieldCharType == "end")
            return "}";
        return "_";
    }

    // Field instruction text is collapsed into a single underscore.
    if (name == W.instrText)
        return "_";

    // Symbols are mapped onto a single character by SymToChar, which takes
    // care of w:char values that are used with more than one font.
    if (name == W.sym)
        return SymToChar(element).ToString();

    // Anything else is represented by a character that doesn't typically
    // appear in documents.
    return StartOfHeading.ToString();
}
/// <summary>
/// Translate a symbol, given by its w:font attribute value and its Unicode
/// character, into the Unicode character used to represent it.
///
/// This overload widens the character to its integer value and delegates to the
/// integer-based overload, which adds U+F000 to values less than U+1000 (as
/// MS Word does) to shift them into the Unicode private use area. A substitute
/// character is returned if the same Unicode value is already used in
/// conjunction with a different w:font attribute value.
/// </summary>
/// <remarks>
/// For w:sym elements, the w:char attribute value is typically greater than "F000",
/// because U+F000 is added to the actual Unicode value to shift the value into
/// the Unicode private use area.
/// </remarks>
/// <param name="fontAttributeValue">The w:font attribute value, e.g., "Wingdings".</param>
/// <param name="unicodeValue">The unicode value.</param>
/// <returns>The Unicode character used to represent the symbol.</returns>
public static char SymToChar(string fontAttributeValue, char unicodeValue)
{
    // Widening to int selects the integer-based overload.
    int codePoint = unicodeValue;
    return SymToChar(fontAttributeValue, codePoint);
}
/// <summary>
/// Translate a symbol into a Unicode character, using the specified w:font attribute
/// value and unicode value (represented by the w:sym element's w:char attribute),
/// using a substitute value for the actual Unicode value if the same Unicode value
/// is already used in conjunction with a different w:font attribute value.
///
/// Add U+F000 to the Unicode value if the specified value is less than U+1000, which
/// shifts the value into the Unicode private use area (which is also done by MS Word).
/// </summary>
/// <remarks>
/// For w:sym elements, the w:char attribute value is typically greater than "F000",
/// because U+F000 is added to the actual Unicode value to shift the value into
/// the Unicode private use area.
/// </remarks>
/// <param name="fontAttributeValue">The w:font attribute value, e.g., "Wingdings".</param>
/// <param name="unicodeValue">The unicode value, which must be in the range U+0000 to U+FFFF.</param>
/// <returns>The Unicode character used to represent the symbol.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="unicodeValue" /> is negative or greater than U+FFFF.
/// </exception>
public static char SymToChar(string fontAttributeValue, int unicodeValue)
{
    // The result is a single char, so only values that fit into a char can be
    // represented. Without this check, a negative value would be shifted by
    // U+F000 and a value above U+FFFF would be truncated by the char cast
    // further down the call chain, in both cases silently yielding an
    // unrelated character.
    if (unicodeValue < char.MinValue || unicodeValue > char.MaxValue)
        throw new ArgumentOutOfRangeException("unicodeValue", unicodeValue,
            "Value must be in the range 0x0000 to 0xFFFF.");

    // Shift small values into the private use area, as MS Word does.
    int effectiveUnicodeValue = unicodeValue < 0x1000 ? 0xF000 + unicodeValue : unicodeValue;

    // The w:char attribute value is a four-digit, upper-case hexadecimal string.
    return SymToChar(fontAttributeValue, effectiveUnicodeValue.ToString("X4"));
}
/// <summary>
/// Translate a symbol, given by its w:font and w:char attribute values, into the
/// Unicode character used to represent it. A substitute character is returned
/// if the same Unicode value is already used in conjunction with a different
/// w:font attribute value.
///
/// The w:char attribute value is used as is, i.e., U+F000 is not added.
/// </summary>
/// <remarks>
/// For w:sym elements, the w:char attribute value is typically greater than "F000",
/// because U+F000 is added to the actual Unicode value to shift the value into
/// the Unicode private use area.
/// </remarks>
/// <param name="fontAttributeValue">The w:font attribute value, e.g., "Wingdings".</param>
/// <param name="charAttributeValue">The w:char attribute value, e.g., "F028".</param>
/// <returns>The Unicode character used to represent the symbol.</returns>
/// <exception cref="ArgumentException">
/// One of the arguments is null or empty.
/// </exception>
public static char SymToChar(string fontAttributeValue, string charAttributeValue)
{
    if (string.IsNullOrEmpty(fontAttributeValue))
        throw new ArgumentException("Argument is null or empty.", "fontAttributeValue");
    if (string.IsNullOrEmpty(charAttributeValue))
        throw new ArgumentException("Argument is null or empty.", "charAttributeValue");

    // Wrap the two values in a w:sym element and let the element-based
    // overload do the actual mapping.
    var fontAttribute = new XAttribute(W.font, fontAttributeValue);
    var charAttribute = new XAttribute(W._char, charAttributeValue);
    var namespaceDeclaration = new XAttribute(XNamespace.Xmlns + "w", W.w);
    var symElement = new XElement(W.sym, fontAttribute, charAttribute, namespaceDeclaration);
    return SymToChar(symElement);
}
/// <summary>
/// Represent a w:sym element as a Unicode value, mapping the Unicode value
/// specified in the w:char attribute to a substitute value to be able to
/// use a Unicode value in conjunction with different fonts.
/// </summary>
/// <remarks>
/// The mapping is recorded in two static dictionaries, so the same combination
/// of w:font and w:char attribute values always yields the same character and
/// <see cref="CharToRunChild" /> can turn that character back into a w:sym
/// element.
/// </remarks>
/// <param name="sym">The w:sym element to be stringified.</param>
/// <returns>The Unicode character representing the w:sym element.</returns>
/// <exception cref="ArgumentNullException"><paramref name="sym" /> is null.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="sym" /> is not a w:sym element or lacks the w:font or w:char attribute.
/// </exception>
public static char SymToChar(XElement sym)
{
    if (sym == null)
        throw new ArgumentNullException("sym");
    if (sym.Name != W.sym)
        throw new ArgumentException(string.Format("Not a w:sym: {0}", sym.Name), "sym");
    XAttribute fontAttribute = sym.Attribute(W.font);
    string fontAttributeValue = fontAttribute != null ? fontAttribute.Value : null;
    if (fontAttributeValue == null)
        throw new ArgumentException("w:sym element has no w:font attribute.", "sym");
    XAttribute charAttribute = sym.Attribute(W._char);
    string charAttributeValue = charAttribute != null ? charAttribute.Value : null;
    if (charAttributeValue == null)
        throw new ArgumentException("w:sym element has no w:char attribute.", "sym");

    // Build a standardized w:sym element that carries nothing but the w:font
    // and w:char attributes (plus the namespace declaration), so its serialized
    // form can serve as the dictionary key regardless of any other content of
    // the element passed in.
    var standardizedSym = new XElement(W.sym,
        new XAttribute(W.font, fontAttributeValue),
        new XAttribute(W._char, charAttributeValue),
        new XAttribute(XNamespace.Xmlns + "w", W.w));
    string standardizedSymString = standardizedSym.ToString(SaveOptions.None);

    // Return Unicode value if it is in the dictionary.
    char knownUnicodeChar;
    if (SymStringToUnicodeCharDictionary.TryGetValue(standardizedSymString, out knownUnicodeChar))
        return knownUnicodeChar;

    // Determine Unicode value to be used to represent the current w:sym element.
    // Use the actual Unicode value if it has not yet been used with another font.
    // Otherwise, create a special Unicode value in the private use area to represent
    // the current w:sym element.
    // The substitute value itself may already be taken, e.g., by a symbol whose
    // actual w:char value lies in the private use area, so keep advancing until
    // a free value is found. Otherwise, adding to the reverse dictionary below
    // would throw after the forward dictionary has already been updated, leaving
    // the two dictionaries inconsistent.
    var unicodeChar = (char) Convert.ToInt32(charAttributeValue, 16);
    while (UnicodeCharToSymDictionary.ContainsKey(unicodeChar))
        unicodeChar = ++_lastUnicodeChar;

    SymStringToUnicodeCharDictionary.Add(standardizedSymString, unicodeChar);
    UnicodeCharToSymDictionary.Add(unicodeChar, standardizedSym);
    return unicodeChar;
}
/// <summary>
/// Turn the specified text value into a list of runs with coalesced text elements.
/// Each run will have the specified run properties.
/// </summary>
/// <remarks>
/// Adjacent characters that translate into w:t elements are merged into a single
/// run with a single w:t element. Every other character (e.g., a tab or a break)
/// gets a run of its own. Characters that do not translate into any element,
/// i.e., U+0001, are skipped.
/// </remarks>
/// <param name="textValue">The text value to transform.</param>
/// <param name="runProperties">The run properties to apply.</param>
/// <returns>A list of runs representing the text value.</returns>
public static List<XElement> StringToCoalescedRunList(string textValue, XElement runProperties)
{
    return textValue
        .Select(CharToRunChild)
        // CharToRunChild returns null for U+0001, the placeholder RunToString
        // produces for unrecognized elements. Drop those entries; otherwise,
        // the key selector below would dereference null.
        .Where(runChild => runChild != null)
        .GroupAdjacent(runChild => runChild.Name == W.t)
        .SelectMany(grouping => grouping.Key
            // A group of adjacent w:t elements becomes a single run with one w:t.
            ? StringToSingleRunList(grouping.Select(t => (string) t).StringConcatenate(), runProperties)
            // Every other element is wrapped in a run of its own.
            : grouping.Select(runChild => new XElement(W.r, runProperties, runChild)))
        .ToList();
}
/// <summary>
/// Wrap the specified text value in a single w:t element inside a single w:r
/// element carrying the specified run properties, and return that run as the
/// only item of a list.
/// </summary>
/// <param name="textValue">The text value to transform.</param>
/// <param name="runProperties">The run properties to apply.</param>
/// <returns>A list with a single run.</returns>
public static IEnumerable<XElement> StringToSingleRunList(string textValue, XElement runProperties)
{
    // The w:t element consists of whatever XmlUtil.GetXmlSpaceAttribute yields
    // for the text (presumably an xml:space attribute where needed) and the
    // text itself.
    var textElement = new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(textValue), textValue);

    var runList = new List<XElement>();
    runList.Add(new XElement(W.r, runProperties, textElement));
    return runList;
}
/// <summary>
/// Turn the specified text value into a list of runs, one run per character,
/// each having the specified run properties.
/// </summary>
/// <param name="textValue">The text value to transform.</param>
/// <param name="runProperties">The run properties to apply.</param>
/// <returns>A list of runs representing the text value.</returns>
public static List<XElement> StringToRunList(string textValue, XElement runProperties)
{
    // One run per character, in the order in which the characters appear.
    IEnumerable<XElement> runs =
        from character in textValue
        select CharToRun(character, runProperties);
    return runs.ToList();
}
/// <summary>
/// Create a w:r element that carries the specified run properties and the
/// Open XML element (e.g., w:t, w:br, w:tab) corresponding to the specified
/// character.
/// </summary>
/// <param name="character">The character.</param>
/// <param name="runProperties">The w:rPr element to be added to the w:r element.</param>
/// <returns>The w:r element.</returns>
public static XElement CharToRun(char character, XElement runProperties)
{
    // The run child is null for U+0001, in which case the run ends up
    // without a child element for the character.
    XElement runChild = CharToRunChild(character);
    return new XElement(W.r, runProperties, runChild);
}
/// <summary>
/// Create an Open XML element (e.g., w:t, w:br, w:tab) from the specified
/// character.
/// </summary>
/// <remarks>
/// Characters that were registered for a w:sym element by one of the SymToChar
/// methods are turned into a copy of the registered w:sym element.
/// </remarks>
/// <param name="character">The character.</param>
/// <returns>The Open XML element or null, if the character equals <see cref="StartOfHeading" /> (U+0001).</returns>
public static XElement CharToRunChild(char character)
{
    // Ignore the special character that represents the Open XML elements we
    // wanted to ignore.
    if (character == StartOfHeading)
        return null;

    // Translate special characters into their corresponding Open XML elements.
    // Turn a Carriage Return into an empty w:br element, regardless of whether
    // the former was created from an equivalent w:cr element.
    if (character == CarriageReturn)
        return new XElement(W.br);
    if (character == FormFeed)
        return new XElement(W.br, new XAttribute(W.type, "page"));
    if (character == HorizontalTabulation)
        return new XElement(W.tab);
    if (character == NonBreakingHyphen)
        return new XElement(W.noBreakHyphen);
    if (character == SoftHyphen)
        return new XElement(W.softHyphen);

    // Translate symbol characters into their corresponding w:sym elements.
    // Hand out a copy rather than the registered instance: LINQ to XML attaches
    // a parentless element directly to the tree it is added to, so returning the
    // registered instance would make it part of a caller's tree, where changing
    // it would silently alter the mapping for all subsequent calls.
    XElement registeredSym;
    if (UnicodeCharToSymDictionary.TryGetValue(character, out registeredSym))
        return new XElement(registeredSym);

    // Turn "normal" characters into text elements.
    return new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(character), character);
}
}
}