// EPPlus/FormulaParsing/LexicalAnalysis/SourceCodeTokenizer.cs - epplus - Git at Google

 /*******************************************************************************
  * You may amend and distribute as you like, but don't remove this header!
  *
  * EPPlus provides server-side generation of Excel 2007/2010 spreadsheets.
  * See http://www.codeplex.com/EPPlus for details.
  *
  * Copyright (C) 2011  Jan Källman
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.

  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  * See the GNU Lesser General Public License for more details.
  *
  * The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php
  * If you unfamiliar with this license or have questions about it, here is an http://www.gnu.org/licenses/gpl-faq.html
  *
  * All code and executables are provided "as is" with no warranty either express or implied.
  * The author accepts no liability for any damage or loss of business that this product may cause.
  *
  * Code change notes:
  *
  * Author							Change						Date
  * ******************************************************************************
  * Mats Alm   		                Added       		        2013-03-01 (Prior file history on https://github.com/swmal/ExcelFormulaParser)
  *******************************************************************************/

 using System.Collections.Generic;
 using System.Globalization;
 using System.Linq;
 using System.Text.RegularExpressions;
 using OfficeOpenXml.FormulaParsing.Excel.Functions;

 namespace OfficeOpenXml.FormulaParsing.LexicalAnalysis;

 /// <summary>
 /// Converts a formula string into a list of <see cref="Token"/>s by scanning it character by
 /// character, splitting on the separators known to the <see cref="ITokenSeparatorProvider"/>
 /// and delegating creation of the remaining tokens to the <see cref="ITokenFactory"/>.
 /// </summary>
 public class SourceCodeTokenizer : ISourceCodeTokenizer {
   /// <summary>
   /// A tokenizer built from the empty function-name and name-value providers.
   /// A new instance is created on every access.
   /// </summary>
   public static ISourceCodeTokenizer Default =>
     new SourceCodeTokenizer(FunctionNameProvider.Empty, NameValueProvider.Empty);

   /// <summary>
   /// Creates a tokenizer that uses a <see cref="TokenFactory"/> built from the supplied
   /// providers and a new <see cref="TokenSeparatorProvider"/>.
   /// </summary>
   public SourceCodeTokenizer(
       IFunctionNameProvider functionRepository,
       INameValueProvider nameValueProvider)
       : this(
           new TokenFactory(functionRepository, nameValueProvider),
           new TokenSeparatorProvider()) {}

   /// <summary>
   /// Creates a tokenizer from an explicit token factory and separator provider.
   /// </summary>
   public SourceCodeTokenizer(ITokenFactory tokenFactory, ITokenSeparatorProvider tokenProvider) {
     _tokenFactory = tokenFactory;
     _tokenProvider = tokenProvider;
   }

   private readonly ITokenSeparatorProvider _tokenProvider;
   private readonly ITokenFactory _tokenFactory;

   /// <summary>
   /// Tokenizes <paramref name="input"/> without a worksheet (null is passed on).
   /// </summary>
   public IEnumerable<Token> Tokenize(string input) {
     return Tokenize(input, null);
   }

   /// <summary>
   /// Tokenizes <paramref name="input"/>.
   /// </summary>
   /// <param name="input">The formula text. Leading '+' characters are ignored.</param>
   /// <param name="worksheet">Worksheet name handed to the token factory; may be null.</param>
   /// <returns>
   /// An empty sequence when <paramref name="input"/> is null or empty, otherwise the token list
   /// collected in the tokenizer context after <see cref="CleanupTokens"/> has run.
   /// </returns>
   public IEnumerable<Token> Tokenize(string input, string worksheet) {
     if (string.IsNullOrEmpty(input)) {
       return Enumerable.Empty<Token>();
     }
     // MA 1401: Ignore leading plus in formula.
     input = input.TrimStart('+');
     var context = new TokenizerContext(input);

     // Remembers which quote character opened the string that is currently being read.
     bool isSingleQuoteString = false;
     for (int i = 0; i < context.FormulaChars.Length; i++) {
       var c = context.FormulaChars[i];
       if (CharIsTokenSeparator(c, out var tokenSeparator)) {
         if (context.IsInString) {
           // A quote followed by '"' is an escaped quote: keep one character, skip the other.
           if (IsDoubleQuote(tokenSeparator, i, context)) {
             i++;
             context.AppendToCurrentToken(c);
             continue;
           }
           // Any non-quote separator inside a string is ordinary string content.
           if (tokenSeparator.TokenType != TokenType.String) {
             context.AppendToCurrentToken(c);
             continue;
           }
           // CHANGE 2
           // A quote of the other kind than the one that opened the string is string content.
           if ((isSingleQuoteString && c != '\'') || (!isSingleQuoteString && c != '"')) {
             context.AppendToCurrentToken(c);
             continue;
           }
         }
         // Brackets, and every separator between them, stay part of the current token.
         if (tokenSeparator.TokenType == TokenType.OpeningBracket) {
           context.AppendToCurrentToken(c);
           context.BracketCount++;
           continue;
         }
         if (tokenSeparator.TokenType == TokenType.ClosingBracket) {
           context.AppendToCurrentToken(c);
           context.BracketCount--;
           continue;
         }
         if (context.BracketCount > 0) {
           context.AppendToCurrentToken(c);
           continue;
         }
         // two operators in sequence could be "<=" or ">="
         if (IsPartOfMultipleCharSeparator(context, c)) {
           var sOp = context.LastToken.Value + c.ToString(CultureInfo.InvariantCulture);
           var op = _tokenProvider.Tokens[sOp];
           context.ReplaceLastToken(op);
           context.NewToken();
           continue;
         }
         if (tokenSeparator.TokenType == TokenType.String) {
           // CHANGE3 :
           isSingleQuoteString = (c == '\'');
           // Directly after an opening enumerable only the in-string state is toggled; no
           // String token is emitted for the quote.
           if (context.LastToken != null
               && context.LastToken.TokenType == TokenType.OpeningEnumerable) {
             // context.AppendToCurrentToken(c);  // Praveen's change of 10/28/2015
             context.ToggleIsInString();
             continue;
           }
           // The previous token was a quote, so emit what was collected since then (possibly
           // nothing) as the string content.
           if (context.LastToken != null && context.LastToken.TokenType == TokenType.String) {
             context.AddToken(
                 !context.CurrentTokenHasValue
                     ? new(string.Empty, TokenType.StringContent)
                     : new Token(context.CurrentToken, TokenType.StringContent));
           }
           // The String token always carries '"' as its value, whichever quote was read.
           context.AddToken(new("\"", TokenType.String));
           context.ToggleIsInString();
           context.NewToken();
           continue;
         }
         // An ordinary separator ends the token collected so far.
         if (context.CurrentTokenHasValue) {
           // A token consisting solely of '"' characters is string content.
           if (Regex.IsMatch(context.CurrentToken, "^\"*$")) {
             context.AddToken(_tokenFactory.Create(context.CurrentToken, TokenType.StringContent));
           } else {
             context.AddToken(CreateToken(context, worksheet));
           }

           // If the next token is an opening parenthesis and the previous token was interpreted
           // as an address or name, then that token is actually a function.
           if (tokenSeparator.TokenType == TokenType.OpeningParenthesis
               && (context.LastToken.TokenType == TokenType.ExcelAddress
                       || context.LastToken.TokenType == TokenType.NameValue)) {
             context.LastToken.TokenType = TokenType.Function;
           }
         }
         // A '-' at the start or after an operator/opening token negates instead of subtracting.
         if (tokenSeparator.Value == "-") {
           if (TokenIsNegator(context)) {
             context.AddToken(new("-", TokenType.Negator));
             continue;
           }
         }
         context.AddToken(tokenSeparator);
         context.NewToken();
         continue;
       }
       context.AppendToCurrentToken(c);
     }
     // Flush whatever is left after the last separator.
     if (context.CurrentTokenHasValue) {
       context.AddToken(CreateToken(context, worksheet));
     }

     CleanupTokens(context, _tokenProvider.Tokens);

     return context.Result;
   }

   /// <summary>
   /// Returns true when <paramref name="tokenSeparator"/> is a String separator and the character
   /// after <paramref name="formulaCharIndex"/> is '"'.
   /// </summary>
   /// <remarks>
   /// NOTE(review): only the following character is inspected; the current character is not
   /// compared against '"'. Confirm this is intended for single-quoted strings.
   /// </remarks>
   private static bool IsDoubleQuote(
       Token tokenSeparator,
       int formulaCharIndex,
       TokenizerContext context) {
     return tokenSeparator.TokenType == TokenType.String
         && formulaCharIndex + 1 < context.FormulaChars.Length
         && context.FormulaChars[formulaCharIndex + 1] == '\"';
   }

   /// <summary>
   /// Post-processes the token list in place: settles Unrecognized/Function token types based on
   /// whether an opening parenthesis follows, and collapses runs of '+' and '-' signs.
   /// </summary>
   /// <param name="context">Context whose <c>Result</c> list is modified.</param>
   /// <param name="tokens">Separator tokens, used to look up the "+" and "-" tokens.</param>
   private static void CleanupTokens(TokenizerContext context, IDictionary<string, Token> tokens) {
     for (int i = 0; i < context.Result.Count; i++) {
       var token = context.Result[i];
       if (token.TokenType == TokenType.Unrecognized) {
         // Unrecognized becomes a function when followed by '(', otherwise a name value.
         if (i < context.Result.Count - 1) {
           if (context.Result[i + 1].TokenType == TokenType.OpeningParenthesis) {
             token.TokenType = TokenType.Function;
           } else {
             token.TokenType = TokenType.NameValue;
           }
         } else {
           token.TokenType = TokenType.NameValue;
         }
       } else if (token.TokenType == TokenType.Function) {
         // A function token that is not followed by '(' is demoted to Unrecognized.
         if (i < context.Result.Count - 1) {
           if (context.Result[i + 1].TokenType == TokenType.OpeningParenthesis) {
             token.TokenType = TokenType.Function;
           } else {
             token.TokenType = TokenType.Unrecognized;
           }
         } else {
           token.TokenType = TokenType.Unrecognized;
         }
       } else if ((token.TokenType == TokenType.Operator || token.TokenType == TokenType.Negator)
           && i < context.Result.Count - 1
           && (token.Value == "+" || token.Value == "-")) {
         if (i > 0
             && token.Value
                 == "+") //Remove any + with an opening parenthesis before.
         {
           if (context.Result[i - 1].TokenType == TokenType.OpeningParenthesis) {
             context.Result.RemoveAt(i);
             SetNegatorOperator(context, i, tokens);
             // Step back so the token that moved into slot i is examined as well.
             i--;
             continue;
           }
         }

         var nextToken = context.Result[i + 1];
         if (nextToken.TokenType == TokenType.Operator || nextToken.TokenType == TokenType.Negator) {
           if (token.Value == "+" && (nextToken.Value == "+" || nextToken.Value == "-")) {
             //Remove first
             context.Result.RemoveAt(i);
             SetNegatorOperator(context, i, tokens);
             i--;
           } else if (token.Value == "-" && nextToken.Value == "+") {
             //Remove second
             context.Result.RemoveAt(i + 1);
             SetNegatorOperator(context, i, tokens);
             i--;
           } else if (token.Value == "-" && nextToken.Value == "-") {
             //Remove first and set operator to +
             context.Result.RemoveAt(i);
             if (i == 0) {
               // NOTE(review): after the removal above the second '-' sits at index 0, so this
               // removes the token that follows it. Confirm this is the intended element.
               context.Result.RemoveAt(i + 1);
               i += 2;
             } else {
               //context.Result[i].TokenType = TokenType.Operator;
               //context.Result[i].Value = "+";
               context.Result[i] = tokens["+"];
               SetNegatorOperator(context, i, tokens);
               i--;
             }
           }
         }
       }
     }
   }

   /// <summary>
   /// Re-classifies the "-" token at index <paramref name="i"/>: it becomes a Negator when the
   /// preceding token allows negation, otherwise the "-" token from <paramref name="tokens"/>.
   /// Does nothing for index 0, for other values, or for other token types.
   /// </summary>
   private static void SetNegatorOperator(
       TokenizerContext context,
       int i,
       IDictionary<string, Token> tokens) {
     if (context.Result[i].Value == "-"
         && i > 0
         && (context.Result[i].TokenType == TokenType.Operator
                 || context.Result[i].TokenType == TokenType.Negator)) {
       if (TokenIsNegator(context.Result[i - 1])) {
         context.Result[i] = new("-", TokenType.Negator);
       } else {
         context.Result[i] = tokens["-"];
       }
     }
   }

   /// <summary>
   /// Returns true when a "-" following the context's last token should be read as a negator.
   /// </summary>
   private static bool TokenIsNegator(TokenizerContext context) {
     return TokenIsNegator(context.LastToken);
   }

   /// <summary>
   /// Returns true when a "-" following <paramref name="t"/> should be read as a negator: there
   /// is no previous token, or it is an operator, opening parenthesis, comma, semicolon or
   /// opening enumerable.
   /// </summary>
   private static bool TokenIsNegator(Token t) {
     return t == null
         || t.TokenType == TokenType.Operator
         || t.TokenType == TokenType.OpeningParenthesis
         || t.TokenType == TokenType.Comma
         || t.TokenType == TokenType.SemiColon
         || t.TokenType == TokenType.OpeningEnumerable;
   }

   /// <summary>
   /// Returns true when <paramref name="c"/> can complete a multi-character operator: the last
   /// token is an operator, <paramref name="c"/> is a possible last part of such an operator, and
   /// no other characters have been collected since the last token.
   /// </summary>
   private bool IsPartOfMultipleCharSeparator(TokenizerContext context, char c) {
     var lastToken = context.LastToken != null ? context.LastToken.Value : string.Empty;
     return _tokenProvider.IsOperator(lastToken)
         && _tokenProvider.IsPossibleLastPartOfMultipleCharOperator(
             c.ToString(CultureInfo.InvariantCulture))
         && !context.CurrentTokenHasValue;
   }

   /// <summary>
   /// Creates a token from the context's current token text. A lone "-" that follows an operator
   /// token becomes a Negator; everything else is delegated to the token factory.
   /// </summary>
   private Token CreateToken(TokenizerContext context, string worksheet) {
     // The previous condition tested "LastToken == null && LastToken.TokenType == ...", which
     // can only be false or throw NullReferenceException. The null check must be "!= null".
     if (context.CurrentToken == "-"
         && context.LastToken != null
         && context.LastToken.TokenType == TokenType.Operator) {
       return new("-", TokenType.Negator);
     }
     return _tokenFactory.Create(context.Result, context.CurrentToken, worksheet);
   }

   /// <summary>
   /// Looks <paramref name="c"/> up among the separator tokens.
   /// </summary>
   /// <param name="c">Character to test.</param>
   /// <param name="token">The matching separator token, or null when there is none.</param>
   /// <returns>True when <paramref name="c"/> is a token separator.</returns>
   private bool CharIsTokenSeparator(char c, out Token token) {
     // Single lookup instead of ContainsKey followed by the indexer.
     return _tokenProvider.Tokens.TryGetValue(c.ToString(), out token);
   }
 }
	/*******************************************************************************
	* You may amend and distribute as you like, but don't remove this header!
	*
	* EPPlus provides server-side generation of Excel 2007/2010 spreadsheets.
	* See http://www.codeplex.com/EPPlus for details.
	*
	* Copyright (C) 2011 Jan Källman
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.

	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	* See the GNU Lesser General Public License for more details.
	*
	* The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php
	* If you unfamiliar with this license or have questions about it, here is an http://www.gnu.org/licenses/gpl-faq.html
	*
	* All code and executables are provided "as is" with no warranty either express or implied.
	* The author accepts no liability for any damage or loss of business that this product may cause.
	*
	* Code change notes:
	*
	* Author Change Date
	* ******************************************************************************
	* Mats Alm Added 2013-03-01 (Prior file history on https://github.com/swmal/ExcelFormulaParser)
	*******************************************************************************/

	using System.Collections.Generic;
	using System.Globalization;
	using System.Linq;
	using System.Text.RegularExpressions;
	using OfficeOpenXml.FormulaParsing.Excel.Functions;

	namespace OfficeOpenXml.FormulaParsing.LexicalAnalysis;

	/// <summary>
	/// Converts a formula string into a list of <see cref="Token"/>s by scanning it character by
	/// character, splitting on the separators known to the <see cref="ITokenSeparatorProvider"/>
	/// and delegating creation of the remaining tokens to the <see cref="ITokenFactory"/>.
	/// </summary>
	public class SourceCodeTokenizer : ISourceCodeTokenizer {
	  /// <summary>
	  /// A tokenizer built from the empty function-name and name-value providers.
	  /// A new instance is created on every access.
	  /// </summary>
	  public static ISourceCodeTokenizer Default =>
	    new SourceCodeTokenizer(FunctionNameProvider.Empty, NameValueProvider.Empty);

	  /// <summary>
	  /// Creates a tokenizer that uses a <see cref="TokenFactory"/> built from the supplied
	  /// providers and a new <see cref="TokenSeparatorProvider"/>.
	  /// </summary>
	  public SourceCodeTokenizer(
	      IFunctionNameProvider functionRepository,
	      INameValueProvider nameValueProvider)
	      : this(
	          new TokenFactory(functionRepository, nameValueProvider),
	          new TokenSeparatorProvider()) {}

	  /// <summary>
	  /// Creates a tokenizer from an explicit token factory and separator provider.
	  /// </summary>
	  public SourceCodeTokenizer(ITokenFactory tokenFactory, ITokenSeparatorProvider tokenProvider) {
	    _tokenFactory = tokenFactory;
	    _tokenProvider = tokenProvider;
	  }

	  private readonly ITokenSeparatorProvider _tokenProvider;
	  private readonly ITokenFactory _tokenFactory;

	  /// <summary>
	  /// Tokenizes <paramref name="input"/> without a worksheet (null is passed on).
	  /// </summary>
	  public IEnumerable<Token> Tokenize(string input) {
	    return Tokenize(input, null);
	  }

	  /// <summary>
	  /// Tokenizes <paramref name="input"/>.
	  /// </summary>
	  /// <param name="input">The formula text. Leading '+' characters are ignored.</param>
	  /// <param name="worksheet">Worksheet name handed to the token factory; may be null.</param>
	  /// <returns>
	  /// An empty sequence when <paramref name="input"/> is null or empty, otherwise the token list
	  /// collected in the tokenizer context after <see cref="CleanupTokens"/> has run.
	  /// </returns>
	  public IEnumerable<Token> Tokenize(string input, string worksheet) {
	    if (string.IsNullOrEmpty(input)) {
	      return Enumerable.Empty<Token>();
	    }
	    // MA 1401: Ignore leading plus in formula.
	    input = input.TrimStart('+');
	    var context = new TokenizerContext(input);

	    // Remembers which quote character opened the string that is currently being read.
	    bool isSingleQuoteString = false;
	    for (int i = 0; i < context.FormulaChars.Length; i++) {
	      var c = context.FormulaChars[i];
	      if (CharIsTokenSeparator(c, out var tokenSeparator)) {
	        if (context.IsInString) {
	          // A quote followed by '"' is an escaped quote: keep one character, skip the other.
	          if (IsDoubleQuote(tokenSeparator, i, context)) {
	            i++;
	            context.AppendToCurrentToken(c);
	            continue;
	          }
	          // Any non-quote separator inside a string is ordinary string content.
	          if (tokenSeparator.TokenType != TokenType.String) {
	            context.AppendToCurrentToken(c);
	            continue;
	          }
	          // CHANGE 2
	          // A quote of the other kind than the one that opened the string is string content.
	          if ((isSingleQuoteString && c != '\'') || (!isSingleQuoteString && c != '"')) {
	            context.AppendToCurrentToken(c);
	            continue;
	          }
	        }
	        // Brackets, and every separator between them, stay part of the current token.
	        if (tokenSeparator.TokenType == TokenType.OpeningBracket) {
	          context.AppendToCurrentToken(c);
	          context.BracketCount++;
	          continue;
	        }
	        if (tokenSeparator.TokenType == TokenType.ClosingBracket) {
	          context.AppendToCurrentToken(c);
	          context.BracketCount--;
	          continue;
	        }
	        if (context.BracketCount > 0) {
	          context.AppendToCurrentToken(c);
	          continue;
	        }
	        // two operators in sequence could be "<=" or ">="
	        if (IsPartOfMultipleCharSeparator(context, c)) {
	          var sOp = context.LastToken.Value + c.ToString(CultureInfo.InvariantCulture);
	          var op = _tokenProvider.Tokens[sOp];
	          context.ReplaceLastToken(op);
	          context.NewToken();
	          continue;
	        }
	        if (tokenSeparator.TokenType == TokenType.String) {
	          // CHANGE3 :
	          isSingleQuoteString = (c == '\'');
	          // Directly after an opening enumerable only the in-string state is toggled; no
	          // String token is emitted for the quote.
	          if (context.LastToken != null
	              && context.LastToken.TokenType == TokenType.OpeningEnumerable) {
	            // context.AppendToCurrentToken(c); // Praveen's change of 10/28/2015
	            context.ToggleIsInString();
	            continue;
	          }
	          // The previous token was a quote, so emit what was collected since then (possibly
	          // nothing) as the string content.
	          if (context.LastToken != null && context.LastToken.TokenType == TokenType.String) {
	            context.AddToken(
	                !context.CurrentTokenHasValue
	                    ? new(string.Empty, TokenType.StringContent)
	                    : new Token(context.CurrentToken, TokenType.StringContent));
	          }
	          // The String token always carries '"' as its value, whichever quote was read.
	          context.AddToken(new("\"", TokenType.String));
	          context.ToggleIsInString();
	          context.NewToken();
	          continue;
	        }
	        // An ordinary separator ends the token collected so far.
	        if (context.CurrentTokenHasValue) {
	          // A token consisting solely of '"' characters is string content.
	          if (Regex.IsMatch(context.CurrentToken, "^\"*$")) {
	            context.AddToken(_tokenFactory.Create(context.CurrentToken, TokenType.StringContent));
	          } else {
	            context.AddToken(CreateToken(context, worksheet));
	          }

	          // If the next token is an opening parenthesis and the previous token was interpreted
	          // as an address or name, then that token is actually a function.
	          if (tokenSeparator.TokenType == TokenType.OpeningParenthesis
	              && (context.LastToken.TokenType == TokenType.ExcelAddress
	                      || context.LastToken.TokenType == TokenType.NameValue)) {
	            context.LastToken.TokenType = TokenType.Function;
	          }
	        }
	        // A '-' at the start or after an operator/opening token negates instead of subtracting.
	        if (tokenSeparator.Value == "-") {
	          if (TokenIsNegator(context)) {
	            context.AddToken(new("-", TokenType.Negator));
	            continue;
	          }
	        }
	        context.AddToken(tokenSeparator);
	        context.NewToken();
	        continue;
	      }
	      context.AppendToCurrentToken(c);
	    }
	    // Flush whatever is left after the last separator.
	    if (context.CurrentTokenHasValue) {
	      context.AddToken(CreateToken(context, worksheet));
	    }

	    CleanupTokens(context, _tokenProvider.Tokens);

	    return context.Result;
	  }

	  /// <summary>
	  /// Returns true when <paramref name="tokenSeparator"/> is a String separator and the character
	  /// after <paramref name="formulaCharIndex"/> is '"'.
	  /// </summary>
	  /// <remarks>
	  /// NOTE(review): only the following character is inspected; the current character is not
	  /// compared against '"'. Confirm this is intended for single-quoted strings.
	  /// </remarks>
	  private static bool IsDoubleQuote(
	      Token tokenSeparator,
	      int formulaCharIndex,
	      TokenizerContext context) {
	    return tokenSeparator.TokenType == TokenType.String
	        && formulaCharIndex + 1 < context.FormulaChars.Length
	        && context.FormulaChars[formulaCharIndex + 1] == '\"';
	  }

	  /// <summary>
	  /// Post-processes the token list in place: settles Unrecognized/Function token types based on
	  /// whether an opening parenthesis follows, and collapses runs of '+' and '-' signs.
	  /// </summary>
	  /// <param name="context">Context whose <c>Result</c> list is modified.</param>
	  /// <param name="tokens">Separator tokens, used to look up the "+" and "-" tokens.</param>
	  private static void CleanupTokens(TokenizerContext context, IDictionary<string, Token> tokens) {
	    for (int i = 0; i < context.Result.Count; i++) {
	      var token = context.Result[i];
	      if (token.TokenType == TokenType.Unrecognized) {
	        // Unrecognized becomes a function when followed by '(', otherwise a name value.
	        if (i < context.Result.Count - 1) {
	          if (context.Result[i + 1].TokenType == TokenType.OpeningParenthesis) {
	            token.TokenType = TokenType.Function;
	          } else {
	            token.TokenType = TokenType.NameValue;
	          }
	        } else {
	          token.TokenType = TokenType.NameValue;
	        }
	      } else if (token.TokenType == TokenType.Function) {
	        // A function token that is not followed by '(' is demoted to Unrecognized.
	        if (i < context.Result.Count - 1) {
	          if (context.Result[i + 1].TokenType == TokenType.OpeningParenthesis) {
	            token.TokenType = TokenType.Function;
	          } else {
	            token.TokenType = TokenType.Unrecognized;
	          }
	        } else {
	          token.TokenType = TokenType.Unrecognized;
	        }
	      } else if ((token.TokenType == TokenType.Operator || token.TokenType == TokenType.Negator)
	          && i < context.Result.Count - 1
	          && (token.Value == "+" || token.Value == "-")) {
	        if (i > 0
	            && token.Value
	                == "+") //Remove any + with an opening parenthesis before.
	        {
	          if (context.Result[i - 1].TokenType == TokenType.OpeningParenthesis) {
	            context.Result.RemoveAt(i);
	            SetNegatorOperator(context, i, tokens);
	            // Step back so the token that moved into slot i is examined as well.
	            i--;
	            continue;
	          }
	        }

	        var nextToken = context.Result[i + 1];
	        if (nextToken.TokenType == TokenType.Operator || nextToken.TokenType == TokenType.Negator) {
	          if (token.Value == "+" && (nextToken.Value == "+" || nextToken.Value == "-")) {
	            //Remove first
	            context.Result.RemoveAt(i);
	            SetNegatorOperator(context, i, tokens);
	            i--;
	          } else if (token.Value == "-" && nextToken.Value == "+") {
	            //Remove second
	            context.Result.RemoveAt(i + 1);
	            SetNegatorOperator(context, i, tokens);
	            i--;
	          } else if (token.Value == "-" && nextToken.Value == "-") {
	            //Remove first and set operator to +
	            context.Result.RemoveAt(i);
	            if (i == 0) {
	              // NOTE(review): after the removal above the second '-' sits at index 0, so this
	              // removes the token that follows it. Confirm this is the intended element.
	              context.Result.RemoveAt(i + 1);
	              i += 2;
	            } else {
	              //context.Result[i].TokenType = TokenType.Operator;
	              //context.Result[i].Value = "+";
	              context.Result[i] = tokens["+"];
	              SetNegatorOperator(context, i, tokens);
	              i--;
	            }
	          }
	        }
	      }
	    }
	  }

	  /// <summary>
	  /// Re-classifies the "-" token at index <paramref name="i"/>: it becomes a Negator when the
	  /// preceding token allows negation, otherwise the "-" token from <paramref name="tokens"/>.
	  /// Does nothing for index 0, for other values, or for other token types.
	  /// </summary>
	  private static void SetNegatorOperator(
	      TokenizerContext context,
	      int i,
	      IDictionary<string, Token> tokens) {
	    if (context.Result[i].Value == "-"
	        && i > 0
	        && (context.Result[i].TokenType == TokenType.Operator
	                || context.Result[i].TokenType == TokenType.Negator)) {
	      if (TokenIsNegator(context.Result[i - 1])) {
	        context.Result[i] = new("-", TokenType.Negator);
	      } else {
	        context.Result[i] = tokens["-"];
	      }
	    }
	  }

	  /// <summary>
	  /// Returns true when a "-" following the context's last token should be read as a negator.
	  /// </summary>
	  private static bool TokenIsNegator(TokenizerContext context) {
	    return TokenIsNegator(context.LastToken);
	  }

	  /// <summary>
	  /// Returns true when a "-" following <paramref name="t"/> should be read as a negator: there
	  /// is no previous token, or it is an operator, opening parenthesis, comma, semicolon or
	  /// opening enumerable.
	  /// </summary>
	  private static bool TokenIsNegator(Token t) {
	    return t == null
	        || t.TokenType == TokenType.Operator
	        || t.TokenType == TokenType.OpeningParenthesis
	        || t.TokenType == TokenType.Comma
	        || t.TokenType == TokenType.SemiColon
	        || t.TokenType == TokenType.OpeningEnumerable;
	  }

	  /// <summary>
	  /// Returns true when <paramref name="c"/> can complete a multi-character operator: the last
	  /// token is an operator, <paramref name="c"/> is a possible last part of such an operator, and
	  /// no other characters have been collected since the last token.
	  /// </summary>
	  private bool IsPartOfMultipleCharSeparator(TokenizerContext context, char c) {
	    var lastToken = context.LastToken != null ? context.LastToken.Value : string.Empty;
	    return _tokenProvider.IsOperator(lastToken)
	        && _tokenProvider.IsPossibleLastPartOfMultipleCharOperator(
	            c.ToString(CultureInfo.InvariantCulture))
	        && !context.CurrentTokenHasValue;
	  }

	  /// <summary>
	  /// Creates a token from the context's current token text. A lone "-" that follows an operator
	  /// token becomes a Negator; everything else is delegated to the token factory.
	  /// </summary>
	  private Token CreateToken(TokenizerContext context, string worksheet) {
	    // The previous condition tested "LastToken == null && LastToken.TokenType == ...", which
	    // can only be false or throw NullReferenceException. The null check must be "!= null".
	    if (context.CurrentToken == "-"
	        && context.LastToken != null
	        && context.LastToken.TokenType == TokenType.Operator) {
	      return new("-", TokenType.Negator);
	    }
	    return _tokenFactory.Create(context.Result, context.CurrentToken, worksheet);
	  }

	  /// <summary>
	  /// Looks <paramref name="c"/> up among the separator tokens.
	  /// </summary>
	  /// <param name="c">Character to test.</param>
	  /// <param name="token">The matching separator token, or null when there is none.</param>
	  /// <returns>True when <paramref name="c"/> is a token separator.</returns>
	  private bool CharIsTokenSeparator(char c, out Token token) {
	    // Single lookup instead of ContainsKey followed by the indexer.
	    return _tokenProvider.Tokens.TryGetValue(c.ToString(), out token);
	  }
	}