Return to Answer

New solution, with list of improvements

Source Link

edited Aug 9, 2018 at 13:56

Alain

Here's a correctionthe most performant version I've got so far based on a remarksuggestions by @202_accepted -@PieterWitvoet and 202_accepted. It also corrects the overflow detection was previously brokenbug from my OP.

/// <summary>High performance integer parser with rudimentary flexibility.</summary>
/// <remarks>Supports negative numbers, but no whitespace or other non-digit characters
/// may be present.
/// Will not parse strings with more than 10 numeric characters,
/// even if there are leading zeros (such that the integer does not overflow).</remarks>
public static unsafe bool FastTryParseInt(string input, out int result)
{
    // We never expect null, but enforcing this may enable some JIT optimizations.
    if (input == null)
    {
        result = default(int);
        return false;
    }
    fixed (char* cString = input)
    {
        unchecked
        {
            char* nextChar = cString;
            bool isNegative = false;
            // Check whether the first character is numeric
            if (*nextChar ==< '0' || *nextChar > '9')
            {
                // Only allow a negative sign at the beginning of the string.
                if (*nextChar != CharNegative)
                {
                    result = default(int);
                    return false;
                }
                isNegative = true;
                nextChar++;// Any other non-numeric characters after this is an error.
            }    if (*++nextChar < '0' || *nextChar > '9')
                {
                    result = 0;default(int);
                    // Special Case: Excel has been known to format zero as "-".
                    // So return true here IFF this non-digit char is the end-of-string.
                    return *nextChar == Char.MinValue;
                }
            }
            // Now process each character of the string
            int localValue = *nextChar++ - '0';
            while (*nextChar >= '0' && *nextChar <= '9')
                resultlocalValue = resultlocalValue * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) 
 return false;          {
            long ptrLen = nextChar -result cString;= default(int);
            if (isNegative) {  return false;
            }

      result      // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar -result; cString;
            // Result and overflow logic is different if (ptrLenthere <was 11L)a returnminus true;sign.
            if (isNegative)
   if (ptrLen > 11L) return false;    {
                nextCharresult = cString-localValue;
 + 1;
            }  // Longest possible negative int is 11 chars (-2147483648)
            else {   // Less than 11 characters (including negative) is no risk of overflow
                if (ptrLen < 10L11L) return true;
                // More than 11 characters is definitely an overflow.
                if (ptrLen > 10L11L) return false;
                nextChar// =Exactly cString;
11 characters needs to be checked for overflow.
     }
           // returnNeat *nextCharTrick: <An '2'overflow ||will *nextCharalways ==result '2'in &&the first digit changing.
                return *(*++nextCharcString <+ '1'1) ||- *nextChar'0' == '1'localValue &&/ 1000000000
                (*++nextChar < '4' || *nextChar// ==Special '4'case, &&
parsing 2147483648 overflows to -2147483648, but this
          (*++nextChar < '7' || *nextChar == '7' &&
   // value should be supported if there was a leading minus sign.
  (*++nextChar < '4' || *nextChar == '4' &&
           || localValue == Int32.MinValue;
  (*++nextChar < '8' || *nextChar == '8' &&   }
            // Same logic if (*++nextCharpositive, <but '3'one ||fewer *nextCharcharacters ==is '3'allowed &&
(no minus sign)
            result = (*++nextCharlocalValue;
 < '6' || *nextChar == '6' &&
     if (ptrLen < 10L) return true;
      (*++nextChar < '4' || *nextChar == '4'if &&
(ptrLen > 10L) return false;
            *++nextCharreturn <=*cString (isNegative- ?'0' '8'== :localValue '7')))))))));/ 1000000000;
        }
    }
}

With commentsBenchmarks

// High performance integer parser with rudimentary flexibility.
// Supports leading negatives, but no white-space or other non-numeric characters.
public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) { // Allows pointer math
        unchecked { // Allows faster integer math by not testing for overflows
            char* nextChar = cString;
            bool isNegative = false;
            // Handle a possible negative sign at the beginning of the string.
            if (*nextChar == CharNegative)
            {
                isNegative = true;
                nextChar++;
            }
            // Now process each character of the string
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) return false;

            // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar - cString;
            // Result and overflow logic is different if there was a minus sign.
            if (isNegative)
            {
                result = -result;
                // Longest possible negative int is 11 chars (-2147483648)
                // Less than 11 characters (including negative) is no risk of overflow
                // More than 11 characters is definitely an overflow.
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                // Exactly 11 characters needs to be more carefully checked for overflow.
                // Go to after the negative sign and continue on to overflow logic.
                nextChar = cString + 1;
            }
            else
            {
                // Same logic, but one fewer characters is allowed (there's no minus sign)
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                // Go to the beginning of the string and continue on to overflow logic.
                nextChar = cString;
            }
            // Expensive (but relatively rare) check for overflow.
            // Traverse the string until we can be sure it will or will not overflow.
            // Note: Int.MaxValue is +2147483647 and Int.MinValue is -2147483648.
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

I was seeing a lot of variability in my benchmark runs when I looped over a billion strings at once, so to mange noise, I changed my test code to run several smaller test in succession. Here are the results for the above code:

In the majority case (less than 11/10 characters) there are no additional ops. Otherwise, for a number close to the limit, it takes 2 additional ops, plus 2 ops per significant digit that matches the limit. (worst case 20 additional ops to parse max/min value). As such, the performance benchmarks are the same as above (within +/-5% on any given run)

Native parser took 1202 ms. Custom parser took 181 ms. Performance gain was 564.09% Native parser took 1211 ms. Custom parser took 177 ms. Performance gain was 584.18% Native parser took 1226 ms. Custom parser took 241 ms. Performance gain was 408.71% Native parser took 1315 ms. Custom parser took 180 ms. Performance gain was 630.56% Native parser took 1402 ms. Custom parser took 182 ms. Performance gain was 670.33% Native parser took 1212 ms. Custom parser took 181 ms. Performance gain was 569.61% Native parser took 1221 ms. Custom parser took 178 ms. Performance gain was 585.96% Native parser took 1203 ms. Custom parser took 178 ms. Performance gain was 575.84% Native parser took 1203 ms. Custom parser took 178 ms. Performance gain was 575.84% Native parser took 1220 ms. Custom parser took 179 ms. Performance gain was 581.56%

I could make the code more compact by storing the digits of IntThe average is about 575% faster than native parse.Min/Max value

Performance Improvements

Uses unsafe and fixed to treat the string as a null-terminated character array, avoiding the need to monitor or pre-compute the length of the string as we traverse it.

Initializes the local value directly using the first numeric digit, avoiding a superfluous multiplication and addition with zero in the first loop.

A null check at the beginning may enable some JIT optimizations.

Uses a local integer value for accumulation during parsing, and only assigns out result once - since manipulating out variables directly is more expensive.

Corrected the overflow check. In the majority case (less than 11/10 characters) there are no additional ops for overflow detection. Otherwise, for a number close to the limit, it takes just a couple additional ops to check whether the first digit changed due to overflow.

Omissions

Still does not account for leading zeros in overflow detection. Doing so slows down the expected case too much.

Still does not allow white-space, thousand separators, a leading positive sign, or trailing signs.

Note that in all above 'unhandled' cases, we're very careful to return false - we will never return true with an arrayincorrect result.

Note that you could replace all instances of return false with return Int.TryParse(input, out result) if you wanted to "fall-back" to the native integer parser in these rare cases to strike a balance between performance and iterating over itflexibility. In our case, but that slowssomething similar is done further up the chain, so I haven't included it downin this code.

Here's a correction based on a remark by @202_accepted - overflow detection was previously broken.

public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) {
        unchecked {
            char* nextChar = cString;
            bool isNegative = false;
            if (*nextChar == CharNegative) {
                isNegative = true;
                nextChar++;
            }
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            if (*nextChar != Char.MinValue) return false;
            long ptrLen = nextChar - cString;
            if (isNegative) {
                result = -result;
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                nextChar = cString + 1;
            }
            else {
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                nextChar = cString;
            }
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

With comments

// High performance integer parser with rudimentary flexibility.
// Supports leading negatives, but no white-space or other non-numeric characters.
public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) { // Allows pointer math
        unchecked { // Allows faster integer math by not testing for overflows
            char* nextChar = cString;
            bool isNegative = false;
            // Handle a possible negative sign at the beginning of the string.
            if (*nextChar == CharNegative)
            {
                isNegative = true;
                nextChar++;
            }
            // Now process each character of the string
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) return false;

            // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar - cString;
            // Result and overflow logic is different if there was a minus sign.
            if (isNegative)
            {
                result = -result;
                // Longest possible negative int is 11 chars (-2147483648)
                // Less than 11 characters (including negative) is no risk of overflow
                // More than 11 characters is definitely an overflow.
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                // Exactly 11 characters needs to be more carefully checked for overflow.
                // Go to after the negative sign and continue on to overflow logic.
                nextChar = cString + 1;
            }
            else
            {
                // Same logic, but one fewer characters is allowed (there's no minus sign)
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                // Go to the beginning of the string and continue on to overflow logic.
                nextChar = cString;
            }
            // Expensive (but relatively rare) check for overflow.
            // Traverse the string until we can be sure it will or will not overflow.
            // Note: Int.MaxValue is +2147483647 and Int.MinValue is -2147483648.
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

I could make the code more compact by storing the digits of Int.Min/Max value in an array and iterating over it, but that slows it down.

Here's the most performant version I've got so far based on suggestions by @PieterWitvoet and 202_accepted. It also corrects the overflow bug from my OP.

/// <summary>High performance integer parser with rudimentary flexibility.</summary>
/// <remarks>Supports negative numbers, but no whitespace or other non-digit characters
/// may be present.
/// Will not parse strings with more than 10 numeric characters,
/// even if there are leading zeros (such that the integer does not overflow).</remarks>
public static unsafe bool FastTryParseInt(string input, out int result)
{
    // We never expect null, but enforcing this may enable some JIT optimizations.
    if (input == null)
    {
        result = default(int);
        return false;
    }
    fixed (char* cString = input)
    {
        unchecked
        {
            char* nextChar = cString;
            bool isNegative = false;
            // Check whether the first character is numeric
            if (*nextChar < '0' || *nextChar > '9')
            {
                // Only allow a negative sign at the beginning of the string.
                if (*nextChar != CharNegative)
                {
                    result = default(int);
                    return false;
                }
                isNegative = true;
                // Any other non-numeric characters after this is an error.
                if (*++nextChar < '0' || *nextChar > '9')
                {
                    result = default(int);
                    // Special Case: Excel has been known to format zero as "-".
                    // So return true here IFF this non-digit char is the end-of-string.
                    return *nextChar == Char.MinValue;
                }
            }
            // Now process each character of the string
            int localValue = *nextChar++ - '0';
            while (*nextChar >= '0' && *nextChar <= '9')
                localValue = localValue * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) 
            {
                result = default(int);
                return false;
            }

            // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar - cString;
            // Result and overflow logic is different if there was a minus sign.
            if (isNegative)
            {
                result = -localValue;
                // Longest possible negative int is 11 chars (-2147483648)
                // Less than 11 characters (including negative) is no risk of overflow
                if (ptrLen < 11L) return true;
                // More than 11 characters is definitely an overflow.
                if (ptrLen > 11L) return false;
                // Exactly 11 characters needs to be checked for overflow.
                // Neat Trick: An overflow will always result in the first digit changing.
                return *(cString + 1) - '0' == localValue / 1000000000
                    // Special case, parsing 2147483648 overflows to -2147483648, but this
                    // value should be supported if there was a leading minus sign.
                    || localValue == Int32.MinValue;
            }
            // Same logic if positive, but one fewer characters is allowed (no minus sign)
            result = localValue;
            if (ptrLen < 10L) return true;
            if (ptrLen > 10L) return false;
            return *cString - '0' == localValue / 1000000000;
        }
    }
}

Benchmarks

Native parser took 1202 ms. Custom parser took 181 ms. Performance gain was 564.09% Native parser took 1211 ms. Custom parser took 177 ms. Performance gain was 584.18% Native parser took 1226 ms. Custom parser took 241 ms. Performance gain was 408.71% Native parser took 1315 ms. Custom parser took 180 ms. Performance gain was 630.56% Native parser took 1402 ms. Custom parser took 182 ms. Performance gain was 670.33% Native parser took 1212 ms. Custom parser took 181 ms. Performance gain was 569.61% Native parser took 1221 ms. Custom parser took 178 ms. Performance gain was 585.96% Native parser took 1203 ms. Custom parser took 178 ms. Performance gain was 575.84% Native parser took 1203 ms. Custom parser took 178 ms. Performance gain was 575.84% Native parser took 1220 ms. Custom parser took 179 ms. Performance gain was 581.56%

The average is about 575% faster than native parse.

Performance Improvements

Uses unsafe and fixed to treat the string as a null-terminated character array, avoiding the need to monitor or pre-compute the length of the string as we traverse it.

Initializes the local value directly using the first numeric digit, avoiding a superfluous multiplication and addition with zero in the first loop.

A null check at the beginning may enable some JIT optimizations.

Uses a local integer value for accumulation during parsing, and only assigns out result once - since manipulating out variables directly is more expensive.

Corrected the overflow check. In the majority case (less than 11/10 characters) there are no additional ops for overflow detection. Otherwise, for a number close to the limit, it takes just a couple additional ops to check whether the first digit changed due to overflow.

Omissions

Still does not account for leading zeros in overflow detection. Doing so slows down the expected case too much.

Still does not allow white-space, thousand separators, a leading positive sign, or trailing signs.

Note that in all above 'unhandled' cases, we're very careful to return false - we will never return true with an incorrect result.

Note that you could replace all instances of return false with return Int.TryParse(input, out result) if you wanted to "fall-back" to the native integer parser in these rare cases to strike a balance between performance and flexibility. In our case, something similar is done further up the chain, so I haven't included it in this code.

Source Link

answered Aug 7, 2018 at 15:27

Alain

Here's a correction based on a remark by @202_accepted - overflow detection was previously broken.

public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) {
        unchecked {
            char* nextChar = cString;
            bool isNegative = false;
            if (*nextChar == CharNegative) {
                isNegative = true;
                nextChar++;
            }
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            if (*nextChar != Char.MinValue) return false;
            long ptrLen = nextChar - cString;
            if (isNegative) {
                result = -result;
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                nextChar = cString + 1;
            }
            else {
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                nextChar = cString;
            }
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

With comments

// High performance integer parser with rudimentary flexibility.
// Supports leading negatives, but no white-space or other non-numeric characters.
public static unsafe bool FastTryParseInt(string input, out int result)
{
    fixed (char* cString = input) { // Allows pointer math
        unchecked { // Allows faster integer math by not testing for overflows
            char* nextChar = cString;
            bool isNegative = false;
            // Handle a possible negative sign at the beginning of the string.
            if (*nextChar == CharNegative)
            {
                isNegative = true;
                nextChar++;
            }
            // Now process each character of the string
            result = 0;
            while (*nextChar >= '0' && *nextChar <= '9')
                result = result * 10 + (*nextChar++ - '0');
            // If the non-numeric character encountered to end the while loop
            // wasn't the null terminator, the string is invalid.
            if (*nextChar != Char.MinValue) return false;

            // We need to check for an integer overflow based on the length of the string
            long ptrLen = nextChar - cString;
            // Result and overflow logic is different if there was a minus sign.
            if (isNegative)
            {
                result = -result;
                // Longest possible negative int is 11 chars (-2147483648)
                // Less than 11 characters (including negative) is no risk of overflow
                // More than 11 characters is definitely an overflow.
                if (ptrLen < 11L) return true;
                if (ptrLen > 11L) return false;
                // Exactly 11 characters needs to be more carefully checked for overflow.
                // Go to after the negative sign and continue on to overflow logic.
                nextChar = cString + 1;
            }
            else
            {
                // Same logic, but one fewer characters is allowed (there's no minus sign)
                if (ptrLen < 10L) return true;
                if (ptrLen > 10L) return false;
                // Go to the beginning of the string and continue on to overflow logic.
                nextChar = cString;
            }
            // Expensive (but relatively rare) check for overflow.
            // Traverse the string until we can be sure it will or will not overflow.
            // Note: Int.MaxValue is +2147483647 and Int.MinValue is -2147483648.
            return *nextChar < '2' || *nextChar == '2' &&
                (*++nextChar < '1' || *nextChar == '1' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '7' || *nextChar == '7' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                (*++nextChar < '8' || *nextChar == '8' &&
                (*++nextChar < '3' || *nextChar == '3' &&
                (*++nextChar < '6' || *nextChar == '6' &&
                (*++nextChar < '4' || *nextChar == '4' &&
                *++nextChar <= (isNegative ? '8' : '7')))))))));
        }
    }
}

I could make the code more compact by storing the digits of Int.Min/Max value in an array and iterating over it, but that slows it down.