Skip to main content
New solution, with list of improvements
Source Link
Alain
  • 472
  • 1
  • 4
  • 17

Here's a correctionthe most performant version I've got so far based on a remarksuggestions by @202_accepted -@PieterWitvoet and 202_accepted. It also corrects the overflow detection was previously brokenbug from my OP.

/// <summary>High performance integer parser with rudimentary flexibility.</summary>
/// <remarks>Supports negative numbers, but no whitespace or other non-digit characters
/// may be present.
/// Will not parse strings with more than 10 numeric characters,
/// even if there are leading zeros (such that the integer does not overflow).</remarks>
public static unsafe bool FastTryParseInt(string input, out int result)
{
    // We never expect null, but enforcing this may enable some JIT optimizations.
    if (input == null)
    {
        result = default(int);
        return false;
    }
    fixed (char* cString = input)
    {
        unchecked
        {
            char* nextChar = cString;
            bool isNegative = false;
            // Check whether the first character is numeric
            if (*nextChar ==< '0' || *nextChar > '9')
            {
                // Only allow a negative sign at the beginning of the string.
                if (*nextChar != CharNegative)
                {
                    result = default(int);
                    return false;
                }
                isNegative = true;
                nextChar++;// Any other non-numeric characters after this is an error.
            }    if (*++nextChar < '0' || *nextChar > '9')
                {
                    result = 0;default(int);
                    // Special Case: Excel has been known to format zero as "-".
                    // So return true here IFF this non-digit char is the end-of-string.
                    return *nextChar == Char.MinValue;
                }
            }
            // Now process each character of the string
            int localValue = *nextChar++ - '0';
            while (*nextChar >= '0' && *nextChar <= '9')
                resultlocalValue = resultlocalValue * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) 
 return false;          {
            long ptrLen = nextChar -result cString;= default(int);
            if (isNegative) {  return false;
            }

      result      // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar -result; cString;
            // Result and overflow logic is different if (ptrLenthere <was 11L)a returnminus true;sign.
            if (isNegative)
   if (ptrLen > 11L) return false;    {
                nextCharresult = cString-localValue;
 + 1;
            }  // Longest possible negative int is 11 chars (-2147483648)
            else {   // Less than 11 characters (including negative) is no risk of overflow
                if (ptrLen < 10L11L) return true;
                // More than 11 characters is definitely an overflow.
                if (ptrLen > 10L11L) return false;
                nextChar// =Exactly cString;
11 characters needs to be checked for overflow.
     }
           // returnNeat *nextCharTrick: <An '2'overflow ||will *nextCharalways ==result '2'in &&the first digit changing.
                return *(*++nextCharcString <+ '1'1) ||- *nextChar'0' == '1'localValue &&/ 1000000000
                (*++nextChar < '4' || *nextChar// ==Special '4'case, &&
parsing 2147483648 overflows to -2147483648, but this
          (*++nextChar < '7' || *nextChar == '7' &&
   // value should be supported if there was a leading minus sign.
  (*++nextChar < '4' || *nextChar == '4' &&
           || localValue == Int32.MinValue;
  (*++nextChar < '8' || *nextChar == '8' &&   }
            // Same logic if (*++nextCharpositive, <but '3'one ||fewer *nextCharcharacters ==is '3'allowed &&
(no minus sign)
            result = (*++nextCharlocalValue;
 < '6' || *nextChar == '6' &&
     if (ptrLen < 10L) return true;
      (*++nextChar < '4' || *nextChar == '4'if &&
(ptrLen > 10L) return false;
            *++nextCharreturn <=*cString (isNegative- ?'0' '8'== :localValue '7')))))))));/ 1000000000;
        }
    }
}

With commentsBenchmarks

// High performance integer parser with rudimentary flexibility.
// Supports leading negatives, but no white-space or other non-numeric characters.
public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) { // Allows pointer math
        unchecked { // Allows faster integer math by not testing for overflows
            char* nextChar = cString;
            bool isNegative = false;
            // Handle a possible negative sign at the beginning of the string.
            if (*nextChar == CharNegative)
            {
                isNegative = true;
                nextChar++;
            }
            // Now process each character of the string
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) return false;

            // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar - cString;
            // Result and overflow logic is different if there was a minus sign.
            if (isNegative)
            {
                result = -result;
                // Longest possible negative int is 11 chars (-2147483648)
                // Less than 11 characters (including negative) is no risk of overflow
                // More than 11 characters is definitely an overflow.
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                // Exactly 11 characters needs to be more carefully checked for overflow.
                // Go to after the negative sign and continue on to overflow logic.
                nextChar = cString + 1;
            }
            else
            {
                // Same logic, but one fewer characters is allowed (there's no minus sign)
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                // Go to the beginning of the string and continue on to overflow logic.
                nextChar = cString;
            }
            // Expensive (but relatively rare) check for overflow.
            // Traverse the string until we can be sure it will or will not overflow.
            // Note: Int.MaxValue is +2147483647 and Int.MinValue is -2147483648.
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

I was seeing a lot of variability in my benchmark runs when I looped over a billion strings at once, so to mange noise, I changed my test code to run several smaller test in succession. Here are the results for the above code:

In the majority case (less than 11/10 characters) there are no additional ops. Otherwise, for a number close to the limit, it takes 2 additional ops, plus 2 ops per significant digit that matches the limit. (worst case 20 additional ops to parse max/min value). As such, the performance benchmarks are the same as above (within +/-5% on any given run)

Native parser took 1202 ms. Custom parser took 181 ms. Performance gain was 564.09% Native parser took 1211 ms. Custom parser took 177 ms. Performance gain was 584.18% Native parser took 1226 ms. Custom parser took 241 ms. Performance gain was 408.71% Native parser took 1315 ms. Custom parser took 180 ms. Performance gain was 630.56% Native parser took 1402 ms. Custom parser took 182 ms. Performance gain was 670.33% Native parser took 1212 ms. Custom parser took 181 ms. Performance gain was 569.61% Native parser took 1221 ms. Custom parser took 178 ms. Performance gain was 585.96% Native parser took 1203 ms. Custom parser took 178 ms. Performance gain was 575.84% Native parser took 1203 ms. Custom parser took 178 ms. Performance gain was 575.84% Native parser took 1220 ms. Custom parser took 179 ms. Performance gain was 581.56%

I could make the code more compact by storing the digits of IntThe average is about 575% faster than native parse.Min/Max value

Performance Improvements

  • Uses unsafe and fixed to treat the string as a null-terminated character array, avoiding the need to monitor or pre-compute the length of the string as we traverse it.

  • Initializes the local value directly using the first numeric digit, avoiding a superfluous multiplication and addition with zero in the first loop.

  • A null check at the beginning may enable some JIT optimizations.

  • Uses a local integer value for accumulation during parsing, and only assigns out result once - since manipulating out variables directly is more expensive.

  • Corrected the overflow check. In the majority case (less than 11/10 characters) there are no additional ops for overflow detection. Otherwise, for a number close to the limit, it takes just a couple additional ops to check whether the first digit changed due to overflow.

Omissions

  • Still does not account for leading zeros in overflow detection. Doing so slows down the expected case too much.

  • Still does not allow white-space, thousand separators, a leading positive sign, or trailing signs.

Note that in all above 'unhandled' cases, we're very careful to return false - we will never return true with an arrayincorrect result.

Note that you could replace all instances of return false with return Int.TryParse(input, out result) if you wanted to "fall-back" to the native integer parser in these rare cases to strike a balance between performance and iterating over itflexibility. In our case, but that slowssomething similar is done further up the chain, so I haven't included it downin this code.

Here's a correction based on a remark by @202_accepted - overflow detection was previously broken.

public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) {
        unchecked {
            char* nextChar = cString;
            bool isNegative = false;
            if (*nextChar == CharNegative) {
                isNegative = true;
                nextChar++;
            }
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            if (*nextChar != Char.MinValue) return false;
            long ptrLen = nextChar - cString;
            if (isNegative) {
                result = -result;
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                nextChar = cString + 1;
            }
            else {
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                nextChar = cString;
            }
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

With comments

// High performance integer parser with rudimentary flexibility.
// Supports leading negatives, but no white-space or other non-numeric characters.
public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) { // Allows pointer math
        unchecked { // Allows faster integer math by not testing for overflows
            char* nextChar = cString;
            bool isNegative = false;
            // Handle a possible negative sign at the beginning of the string.
            if (*nextChar == CharNegative)
            {
                isNegative = true;
                nextChar++;
            }
            // Now process each character of the string
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) return false;

            // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar - cString;
            // Result and overflow logic is different if there was a minus sign.
            if (isNegative)
            {
                result = -result;
                // Longest possible negative int is 11 chars (-2147483648)
                // Less than 11 characters (including negative) is no risk of overflow
                // More than 11 characters is definitely an overflow.
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                // Exactly 11 characters needs to be more carefully checked for overflow.
                // Go to after the negative sign and continue on to overflow logic.
                nextChar = cString + 1;
            }
            else
            {
                // Same logic, but one fewer characters is allowed (there's no minus sign)
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                // Go to the beginning of the string and continue on to overflow logic.
                nextChar = cString;
            }
            // Expensive (but relatively rare) check for overflow.
            // Traverse the string until we can be sure it will or will not overflow.
            // Note: Int.MaxValue is +2147483647 and Int.MinValue is -2147483648.
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

In the majority case (less than 11/10 characters) there are no additional ops. Otherwise, for a number close to the limit, it takes 2 additional ops, plus 2 ops per significant digit that matches the limit. (worst case 20 additional ops to parse max/min value). As such, the performance benchmarks are the same as above (within +/-5% on any given run)

I could make the code more compact by storing the digits of Int.Min/Max value in an array and iterating over it, but that slows it down.

Here's the most performant version I've got so far based on suggestions by @PieterWitvoet and 202_accepted. It also corrects the overflow bug from my OP.

/// <summary>High performance integer parser with rudimentary flexibility.</summary>
/// <remarks>Supports negative numbers, but no whitespace or other non-digit characters
/// may be present.
/// Will not parse strings with more than 10 numeric characters,
/// even if there are leading zeros (such that the integer does not overflow).</remarks>
public static unsafe bool FastTryParseInt(string input, out int result)
{
    // We never expect null, but enforcing this may enable some JIT optimizations.
    if (input == null)
    {
        result = default(int);
        return false;
    }
    fixed (char* cString = input)
    {
        unchecked
        {
            char* nextChar = cString;
            bool isNegative = false;
            // Check whether the first character is numeric
            if (*nextChar < '0' || *nextChar > '9')
            {
                // Only allow a negative sign at the beginning of the string.
                if (*nextChar != CharNegative)
                {
                    result = default(int);
                    return false;
                }
                isNegative = true;
                // Any other non-numeric characters after this is an error.
                if (*++nextChar < '0' || *nextChar > '9')
                {
                    result = default(int);
                    // Special Case: Excel has been known to format zero as "-".
                    // So return true here IFF this non-digit char is the end-of-string.
                    return *nextChar == Char.MinValue;
                }
            }
            // Now process each character of the string
            int localValue = *nextChar++ - '0';
            while (*nextChar >= '0' && *nextChar <= '9')
                localValue = localValue * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) 
            {
                result = default(int);
                return false;
            }

            // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar - cString;
            // Result and overflow logic is different if there was a minus sign.
            if (isNegative)
            {
                result = -localValue;
                // Longest possible negative int is 11 chars (-2147483648)
                // Less than 11 characters (including negative) is no risk of overflow
                if (ptrLen < 11L) return true;
                // More than 11 characters is definitely an overflow.
                if (ptrLen > 11L) return false;
                // Exactly 11 characters needs to be checked for overflow.
                // Neat Trick: An overflow will always result in the first digit changing.
                return *(cString + 1) - '0' == localValue / 1000000000
                    // Special case, parsing 2147483648 overflows to -2147483648, but this
                    // value should be supported if there was a leading minus sign.
                    || localValue == Int32.MinValue;
            }
            // Same logic if positive, but one fewer characters is allowed (no minus sign)
            result = localValue;
            if (ptrLen < 10L) return true;
            if (ptrLen > 10L) return false;
            return *cString - '0' == localValue / 1000000000;
        }
    }
}

Benchmarks

I was seeing a lot of variability in my benchmark runs when I looped over a billion strings at once, so to mange noise, I changed my test code to run several smaller test in succession. Here are the results for the above code:

Native parser took 1202 ms. Custom parser took 181 ms. Performance gain was 564.09% Native parser took 1211 ms. Custom parser took 177 ms. Performance gain was 584.18% Native parser took 1226 ms. Custom parser took 241 ms. Performance gain was 408.71% Native parser took 1315 ms. Custom parser took 180 ms. Performance gain was 630.56% Native parser took 1402 ms. Custom parser took 182 ms. Performance gain was 670.33% Native parser took 1212 ms. Custom parser took 181 ms. Performance gain was 569.61% Native parser took 1221 ms. Custom parser took 178 ms. Performance gain was 585.96% Native parser took 1203 ms. Custom parser took 178 ms. Performance gain was 575.84% Native parser took 1203 ms. Custom parser took 178 ms. Performance gain was 575.84% Native parser took 1220 ms. Custom parser took 179 ms. Performance gain was 581.56%

The average is about 575% faster than native parse.

Performance Improvements

  • Uses unsafe and fixed to treat the string as a null-terminated character array, avoiding the need to monitor or pre-compute the length of the string as we traverse it.

  • Initializes the local value directly using the first numeric digit, avoiding a superfluous multiplication and addition with zero in the first loop.

  • A null check at the beginning may enable some JIT optimizations.

  • Uses a local integer value for accumulation during parsing, and only assigns out result once - since manipulating out variables directly is more expensive.

  • Corrected the overflow check. In the majority case (less than 11/10 characters) there are no additional ops for overflow detection. Otherwise, for a number close to the limit, it takes just a couple additional ops to check whether the first digit changed due to overflow.

Omissions

  • Still does not account for leading zeros in overflow detection. Doing so slows down the expected case too much.

  • Still does not allow white-space, thousand separators, a leading positive sign, or trailing signs.

Note that in all above 'unhandled' cases, we're very careful to return false - we will never return true with an incorrect result.

Note that you could replace all instances of return false with return Int.TryParse(input, out result) if you wanted to "fall-back" to the native integer parser in these rare cases to strike a balance between performance and flexibility. In our case, something similar is done further up the chain, so I haven't included it in this code.

Source Link
Alain
  • 472
  • 1
  • 4
  • 17

Here's a correction based on a remark by @202_accepted - overflow detection was previously broken.

public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) {
        unchecked {
            char* nextChar = cString;
            bool isNegative = false;
            if (*nextChar == CharNegative) {
                isNegative = true;
                nextChar++;
            }
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            if (*nextChar != Char.MinValue) return false;
            long ptrLen = nextChar - cString;
            if (isNegative) {
                result = -result;
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                nextChar = cString + 1;
            }
            else {
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                nextChar = cString;
            }
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

With comments

// High performance integer parser with rudimentary flexibility.
// Supports leading negatives, but no white-space or other non-numeric characters.
public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) { // Allows pointer math
        unchecked { // Allows faster integer math by not testing for overflows
            char* nextChar = cString;
            bool isNegative = false;
            // Handle a possible negative sign at the beginning of the string.
            if (*nextChar == CharNegative)
            {
                isNegative = true;
                nextChar++;
            }
            // Now process each character of the string
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) return false;

            // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar - cString;
            // Result and overflow logic is different if there was a minus sign.
            if (isNegative)
            {
                result = -result;
                // Longest possible negative int is 11 chars (-2147483648)
                // Less than 11 characters (including negative) is no risk of overflow
                // More than 11 characters is definitely an overflow.
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                // Exactly 11 characters needs to be more carefully checked for overflow.
                // Go to after the negative sign and continue on to overflow logic.
                nextChar = cString + 1;
            }
            else
            {
                // Same logic, but one fewer characters is allowed (there's no minus sign)
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                // Go to the beginning of the string and continue on to overflow logic.
                nextChar = cString;
            }
            // Expensive (but relatively rare) check for overflow.
            // Traverse the string until we can be sure it will or will not overflow.
            // Note: Int.MaxValue is +2147483647 and Int.MinValue is -2147483648.
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

In the majority case (less than 11/10 characters) there are no additional ops. Otherwise, for a number close to the limit, it takes 2 additional ops, plus 2 ops per significant digit that matches the limit. (worst case 20 additional ops to parse max/min value). As such, the performance benchmarks are the same as above (within +/-5% on any given run)

I could make the code more compact by storing the digits of Int.Min/Max value in an array and iterating over it, but that slows it down.