Skip to main content
Added support for leading spaces, following feedback from @JanDotNet
Source Link
forsvarir
  • 11.8k
  • 7
  • 39
  • 72
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace MudCore
{
    /// <summary>
    /// Splits a command line into tokens. A token is either a run of characters
    /// containing no whitespace and no double quote, or the text enclosed in
    /// double quotes (which may contain whitespace). The quotes themselves are
    /// not part of the token.
    /// </summary>
    public static class CommandTokenizer
    {
        // One match, anchored by construction at the start of the input, made of
        // zero or more repetitions of either:
        //   \s*"(?<token>[^"]*)("|$)\s*   a quoted token; an unclosed quote runs to the end
        //   \s*(?<token>[^\s"]+)\s*       a bare token
        // ExplicitCapture keeps the unnamed groups from capturing, so "token" is
        // the only group that records anything.
        private static readonly Regex _pattern = new Regex(
            @"((\s*""(?<token>[^""]*)(""|$)\s*)|(\s*(?<token>[^\s""]+)\s*))*",
            RegexOptions.Compiled | RegexOptions.ExplicitCapture);

        /// <summary>
        /// Breaks <paramref name="input"/> into its tokens, in order of appearance.
        /// </summary>
        /// <param name="input">The command text to split.</param>
        /// <returns>
        /// The tokens found; an empty array when the input holds no tokens.
        /// </returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="input"/> is null (raised by the underlying regex match).
        /// </exception>
        public static string[] Tokenise(string input)
        {
            List<string> tokens = new List<string>();

            // The pattern can match the empty string, so the match always succeeds;
            // every repetition of the outer group leaves one capture in "token".
            foreach (Capture capture in _pattern.Match(input).Groups["token"].Captures)
            {
                tokens.Add(capture.Value);
            }

            return tokens.ToArray();
        }
    }
}
using MudCore;
using NUnit.Framework;

namespace MudCoreTests
{
    /// <summary>
    /// Tests for <see cref="CommandTokenizer.Tokenise"/> covering bare words,
    /// leading whitespace and double-quoted tokens.
    /// </summary>
    [TestFixture]
    public class CommandTokenizerTests
    {
        /// <summary>A lone word is returned as the only token.</summary>
        [Test]
        public void SingleWordBecomesSingleToken()
        {
            var tokens = CommandTokenizer.Tokenise("single");

            CollectionAssert.AreEqual(new string[] { "single" }, tokens);
        }

        /// <summary>Space-separated words are returned one token per word, in order.</summary>
        [Test]
        public void MultipleWordsReturnMultipleTokens()
        {
            var tokens = CommandTokenizer.Tokenise("there are multiple tokens");

            CollectionAssert.AreEqual(new string[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        /// <summary>A space before the first word does not change the tokens.</summary>
        [Test]
        public void LeadingSpacesIgnored()
        {
            var tokens = CommandTokenizer.Tokenise(" there are multiple tokens");

            CollectionAssert.AreEqual(new string[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        /// <summary>
        /// Text inside double quotes is returned as one token without the quotes.
        /// Each case supplies the input, the expected tokens, and a label that is
        /// used as the assertion message.
        /// </summary>
        [TestCase("There are \"some quoted tokens\" in the text",
                  new string[] { "There", "are", "some quoted tokens", "in", "the", "text" },
                  "quoted in middle")]
        [TestCase("\"some quoted tokens\" There are in the text",
                  new string[] { "some quoted tokens", "There", "are", "in", "the", "text" },
                  "quoted at start")]
        [TestCase(" \"some quoted tokens\" There are in the text",
                  new string[] { "some quoted tokens", "There", "are", "in", "the", "text" },
                  "space then quoted at start")]
        [TestCase("There are in the text \"some quoted tokens\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens" },
                  "quoted at end")]
        [TestCase("There \"are\" in the text \"some quoted tokens\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens" },
                  "multiple quotes")]
        [TestCase("There are in the text \"some quoted tokens, that have punctionation.  And other stuff\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens, that have punctionation.  And other stuff" },
                  "punctuation in quote")]
        [TestCase("There are, in the text \"some quoted tokens\".",
                  new string[] { "There", "are,", "in", "the", "text", "some quoted tokens", "." },
                  "punctuation outside of quotes")]
        [TestCase("; There are \"some quoted tokens\" in the text",
                  new string[] { ";", "There", "are", "some quoted tokens", "in", "the", "text" },
                  "semi-colon recognised")]
        [TestCase("\"Outer quote\" nested quote \"back out\" really out",
                  new string[] { "Outer quote", "nested", "quote", "back out", "really", "out" },
                  "nested quote")]
        [TestCase("Mismatched quotes \"are ignored",
                  new string[] { "Mismatched", "quotes", "are ignored" },
                  "unclosed quotes run to end of line")]
        public void QuotedStringsTreatedAsSingleToken(string inputText, string[] expectedTokens, string testName)
        {
            var tokens = CommandTokenizer.Tokenise(inputText);

            // Compare the whole sequence in one assertion: a count mismatch then
            // reports both collections instead of only the two lengths.
            CollectionAssert.AreEqual(expectedTokens, tokens, testName);
        }
    }
}
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace MudCore
{
    /// <summary>
    /// Splits a command line into tokens. A token is either a run of characters
    /// containing no whitespace and no double quote, or the text enclosed in
    /// double quotes (which may contain whitespace). The quotes themselves are
    /// not part of the token.
    /// </summary>
    public static class CommandTokenizer
    {
        // Leading whitespace is consumed first; without that the match would stop
        // (empty) at position 0 and input such as " look north" would give no tokens.
        // After it come zero or more repetitions of either:
        //   "(?<token>[^"]*)("|$)\s*   a quoted token; an unclosed quote runs to the end
        //   (?<token>[^\s"]+)\s*       a bare token
        // ExplicitCapture keeps the unnamed groups from capturing.
        private static readonly Regex _pattern = new Regex(
            @"\s*((""(?<token>[^""]*)(""|$)\s*)|((?<token>[^\s""]+)\s*))*",
            RegexOptions.Compiled | RegexOptions.ExplicitCapture);

        /// <summary>
        /// Breaks <paramref name="input"/> into its tokens, in order of appearance.
        /// </summary>
        /// <param name="input">The command text to split.</param>
        /// <returns>
        /// The tokens found; an empty array when the input holds no tokens.
        /// </returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="input"/> is null (raised by the underlying regex match).
        /// </exception>
        public static string[] Tokenise(string input)
        {
            List<string> tokens = new List<string>();

            // The pattern can match the empty string, so the match always succeeds;
            // every repetition of the outer group leaves one capture in "token".
            foreach (Capture capture in _pattern.Match(input).Groups["token"].Captures)
            {
                tokens.Add(capture.Value);
            }

            return tokens.ToArray();
        }
    }
}
using MudCore;
using NUnit.Framework;

namespace MudCoreTests
{
    /// <summary>
    /// Tests for <see cref="CommandTokenizer.Tokenise"/> covering bare words and
    /// double-quoted tokens.
    /// </summary>
    [TestFixture]
    public class CommandTokenizerTests
    {
        /// <summary>A lone word is returned as the only token.</summary>
        [Test]
        public void SingleWordBecomesSingleToken()
        {
            var tokens = CommandTokenizer.Tokenise("single");

            CollectionAssert.AreEqual(new string[] { "single" }, tokens);
        }

        /// <summary>Space-separated words are returned one token per word, in order.</summary>
        [Test]
        public void MultipleWordsReturnMultipleTokens()
        {
            var tokens = CommandTokenizer.Tokenise("there are multiple tokens");

            CollectionAssert.AreEqual(new string[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        /// <summary>
        /// Text inside double quotes is returned as one token without the quotes.
        /// Each case supplies the input, the expected tokens, and a label that is
        /// used as the assertion message.
        /// </summary>
        [TestCase("There are \"some quoted tokens\" in the text",
                  new string[] { "There", "are", "some quoted tokens", "in", "the", "text" },
                  "quoted in middle")]
        [TestCase("\"some quoted tokens\" There are in the text",
                  new string[] { "some quoted tokens", "There", "are", "in", "the", "text" },
                  "quoted at start")]
        [TestCase("There are in the text \"some quoted tokens\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens" },
                  "quoted at end")]
        [TestCase("There \"are\" in the text \"some quoted tokens\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens" },
                  "multiple quotes")]
        [TestCase("There are in the text \"some quoted tokens, that have punctionation.  And other stuff\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens, that have punctionation.  And other stuff" },
                  "punctuation in quote")]
        [TestCase("There are, in the text \"some quoted tokens\".",
                  new string[] { "There", "are,", "in", "the", "text", "some quoted tokens", "." },
                  "punctuation outside of quotes")]
        [TestCase("; There are \"some quoted tokens\" in the text",
                  new string[] { ";", "There", "are", "some quoted tokens", "in", "the", "text" },
                  "semi-colon recognised")]
        [TestCase("\"Outer quote\" nested quote \"back out\" really out",
                  new string[] { "Outer quote", "nested", "quote", "back out", "really", "out" },
                  "nested quote")]
        [TestCase("Mismatched quotes \"are ignored",
                  new string[] { "Mismatched", "quotes", "are ignored" },
                  "unclosed quotes run to end of line")]
        public void QuotedStringsTreatedAsSingleToken(string inputText, string[] expectedTokens, string testName)
        {
            var tokens = CommandTokenizer.Tokenise(inputText);

            // Compare the whole sequence in one assertion: a count mismatch then
            // reports both collections instead of only the two lengths.
            CollectionAssert.AreEqual(expectedTokens, tokens, testName);
        }
    }
}
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace MudCore
{
    /// <summary>
    /// Splits a command line into tokens. A token is either a run of characters
    /// containing no whitespace and no double quote, or the text enclosed in
    /// double quotes (which may contain whitespace). The quotes themselves are
    /// not part of the token.
    /// </summary>
    public static class CommandTokenizer
    {
        // One match starting at the beginning of the input, made of zero or more
        // repetitions of either:
        //   \s*"(?<token>[^"]*)("|$)\s*   a quoted token; an unclosed quote runs to the end
        //   \s*(?<token>[^\s"]+)\s*       a bare token
        // ExplicitCapture keeps the unnamed groups from capturing, so "token" is
        // the only group that records anything.
        private static readonly Regex _pattern = new Regex(
            @"((\s*""(?<token>[^""]*)(""|$)\s*)|(\s*(?<token>[^\s""]+)\s*))*",
            RegexOptions.Compiled | RegexOptions.ExplicitCapture);

        /// <summary>
        /// Breaks <paramref name="input"/> into its tokens, in order of appearance.
        /// </summary>
        /// <param name="input">The command text to split.</param>
        /// <returns>
        /// The tokens found; an empty array when the input holds no tokens.
        /// </returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="input"/> is null (raised by the underlying regex match).
        /// </exception>
        public static string[] Tokenise(string input)
        {
            List<string> tokens = new List<string>();

            // No Success check: the pattern also matches the empty string, so the
            // match cannot fail. An input without tokens simply has no captures.
            CaptureCollection captures = _pattern.Match(input).Groups["token"].Captures;
            foreach (Capture capture in captures)
            {
                tokens.Add(capture.Value);
            }

            return tokens.ToArray();
        }
    }
}
using MudCore;
using NUnit.Framework;

namespace MudCoreTests
{
    /// <summary>
    /// Tests for <see cref="CommandTokenizer.Tokenise"/> covering bare words,
    /// leading whitespace and double-quoted tokens.
    /// </summary>
    [TestFixture]
    public class CommandTokenizerTests
    {
        /// <summary>A lone word is returned as the only token.</summary>
        [Test]
        public void SingleWordBecomesSingleToken()
        {
            var tokens = CommandTokenizer.Tokenise("single");

            CollectionAssert.AreEqual(new string[] { "single" }, tokens);
        }

        /// <summary>Space-separated words are returned one token per word, in order.</summary>
        [Test]
        public void MultipleWordsReturnMultipleTokens()
        {
            var tokens = CommandTokenizer.Tokenise("there are multiple tokens");

            CollectionAssert.AreEqual(new string[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        /// <summary>A space before the first word does not change the tokens.</summary>
        [Test]
        public void LeadingSpacesIgnored()
        {
            var tokens = CommandTokenizer.Tokenise(" there are multiple tokens");

            CollectionAssert.AreEqual(new string[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        /// <summary>
        /// Text inside double quotes is returned as one token without the quotes.
        /// Each case supplies the input, the expected tokens, and a label that is
        /// used as the assertion message.
        /// </summary>
        [TestCase("There are \"some quoted tokens\" in the text",
                  new string[] { "There", "are", "some quoted tokens", "in", "the", "text" },
                  "quoted in middle")]
        [TestCase("\"some quoted tokens\" There are in the text",
                  new string[] { "some quoted tokens", "There", "are", "in", "the", "text" },
                  "quoted at start")]
        [TestCase(" \"some quoted tokens\" There are in the text",
                  new string[] { "some quoted tokens", "There", "are", "in", "the", "text" },
                  "space then quoted at start")]
        [TestCase("There are in the text \"some quoted tokens\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens" },
                  "quoted at end")]
        [TestCase("There \"are\" in the text \"some quoted tokens\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens" },
                  "multiple quotes")]
        [TestCase("There are in the text \"some quoted tokens, that have punctionation.  And other stuff\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens, that have punctionation.  And other stuff" },
                  "punctuation in quote")]
        [TestCase("There are, in the text \"some quoted tokens\".",
                  new string[] { "There", "are,", "in", "the", "text", "some quoted tokens", "." },
                  "punctuation outside of quotes")]
        [TestCase("; There are \"some quoted tokens\" in the text",
                  new string[] { ";", "There", "are", "some quoted tokens", "in", "the", "text" },
                  "semi-colon recognised")]
        [TestCase("\"Outer quote\" nested quote \"back out\" really out",
                  new string[] { "Outer quote", "nested", "quote", "back out", "really", "out" },
                  "nested quote")]
        [TestCase("Mismatched quotes \"are ignored",
                  new string[] { "Mismatched", "quotes", "are ignored" },
                  "unclosed quotes run to end of line")]
        public void QuotedStringsTreatedAsSingleToken(string inputText, string[] expectedTokens, string testName)
        {
            var tokens = CommandTokenizer.Tokenise(inputText);

            // Compare the whole sequence in one assertion: a count mismatch then
            // reports both collections instead of only the two lengths.
            CollectionAssert.AreEqual(expectedTokens, tokens, testName);
        }
    }
}
Tweeted twitter.com/StackCodeReview/status/806553493775458304
Source Link
forsvarir
  • 11.8k
  • 7
  • 39
  • 72

Command Tokenizer

I've written some code to tokenize a command string into its tokens.

A token is either:

  • A block of any non-whitespace characters
  • A block of characters, which may include whitespace, wrapped in quotes

So, for the input:

This is some text "with information" quoted.

I'd expect the tokens:

  • This
  • is
  • some
  • text
  • with information
  • quoted.

The tokenizer

using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace MudCore
{
    /// <summary>
    /// Splits a command line into tokens. A token is either a run of characters
    /// containing no whitespace and no double quote, or the text enclosed in
    /// double quotes (which may contain whitespace). The quotes themselves are
    /// not part of the token.
    /// </summary>
    public static class CommandTokenizer
    {
        // Leading whitespace is consumed first; without that the match would stop
        // (empty) at position 0 and input such as " look north" would give no tokens.
        // After it come zero or more repetitions of either:
        //   "(?<token>[^"]*)("|$)\s*   a quoted token; an unclosed quote runs to the end
        //   (?<token>[^\s"]+)\s*       a bare token
        // ExplicitCapture keeps the unnamed groups from capturing.
        private static readonly Regex _pattern = new Regex(
            @"\s*((""(?<token>[^""]*)(""|$)\s*)|((?<token>[^\s""]+)\s*))*",
            RegexOptions.Compiled | RegexOptions.ExplicitCapture);

        /// <summary>
        /// Breaks <paramref name="input"/> into its tokens, in order of appearance.
        /// </summary>
        /// <param name="input">The command text to split.</param>
        /// <returns>
        /// The tokens found; an empty array when the input holds no tokens.
        /// </returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="input"/> is null (raised by the underlying regex match).
        /// </exception>
        public static string[] Tokenise(string input)
        {
            List<string> tokens = new List<string>();

            // The pattern can match the empty string, so the match always succeeds;
            // every repetition of the outer group leaves one capture in "token".
            foreach (Capture capture in _pattern.Match(input).Groups["token"].Captures)
            {
                tokens.Add(capture.Value);
            }

            return tokens.ToArray();
        }
    }
}

The Tests

using MudCore;
using NUnit.Framework;

namespace MudCoreTests
{
    /// <summary>
    /// Tests for <see cref="CommandTokenizer.Tokenise"/> covering bare words and
    /// double-quoted tokens.
    /// </summary>
    [TestFixture]
    public class CommandTokenizerTests
    {
        /// <summary>A lone word is returned as the only token.</summary>
        [Test]
        public void SingleWordBecomesSingleToken()
        {
            var tokens = CommandTokenizer.Tokenise("single");

            CollectionAssert.AreEqual(new string[] { "single" }, tokens);
        }

        /// <summary>Space-separated words are returned one token per word, in order.</summary>
        [Test]
        public void MultipleWordsReturnMultipleTokens()
        {
            var tokens = CommandTokenizer.Tokenise("there are multiple tokens");

            CollectionAssert.AreEqual(new string[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        /// <summary>
        /// Text inside double quotes is returned as one token without the quotes.
        /// Each case supplies the input, the expected tokens, and a label that is
        /// used as the assertion message.
        /// </summary>
        [TestCase("There are \"some quoted tokens\" in the text",
                  new string[] { "There", "are", "some quoted tokens", "in", "the", "text" },
                  "quoted in middle")]
        [TestCase("\"some quoted tokens\" There are in the text",
                  new string[] { "some quoted tokens", "There", "are", "in", "the", "text" },
                  "quoted at start")]
        [TestCase("There are in the text \"some quoted tokens\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens" },
                  "quoted at end")]
        [TestCase("There \"are\" in the text \"some quoted tokens\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens" },
                  "multiple quotes")]
        [TestCase("There are in the text \"some quoted tokens, that have punctionation.  And other stuff\"",
                  new string[] { "There", "are", "in", "the", "text", "some quoted tokens, that have punctionation.  And other stuff" },
                  "punctuation in quote")]
        [TestCase("There are, in the text \"some quoted tokens\".",
                  new string[] { "There", "are,", "in", "the", "text", "some quoted tokens", "." },
                  "punctuation outside of quotes")]
        [TestCase("; There are \"some quoted tokens\" in the text",
                  new string[] { ";", "There", "are", "some quoted tokens", "in", "the", "text" },
                  "semi-colon recognised")]
        [TestCase("\"Outer quote\" nested quote \"back out\" really out",
                  new string[] { "Outer quote", "nested", "quote", "back out", "really", "out" },
                  "nested quote")]
        [TestCase("Mismatched quotes \"are ignored",
                  new string[] { "Mismatched", "quotes", "are ignored" },
                  "unclosed quotes run to end of line")]
        public void QuotedStringsTreatedAsSingleToken(string inputText, string[] expectedTokens, string testName)
        {
            var tokens = CommandTokenizer.Tokenise(inputText);

            // Compare the whole sequence in one assertion: a count mismatch then
            // reports both collections instead of only the two lengths.
            CollectionAssert.AreEqual(expectedTokens, tokens, testName);
        }
    }
}

It seems like this should be a fairly common task, so maybe there's a better approach that I've missed. Is the regex legible, and am I missing any optimisations? Any other feedback is welcome too.