Skip to content

Add writeVarintSve for aarch64 - retry #9603

Open
Nicoshev wants to merge 1 commit into
facebook:master from
Nicoshev:export-D73513003
Open

Add writeVarintSve for aarch64 - retry #9603
Nicoshev wants to merge 1 commit into
facebook:master from
Nicoshev:export-D73513003

Conversation

@Nicoshev

Copy link
Copy Markdown
Contributor

Summary:
Implemented an explicit SVE version of writeVarint.
Throughput for 64-bit types shows a ~15% improvement.
16-bit and 32-bit cases seem to show a small improvement as well.

All three functions are branch-free; their disassembly can be seen here: https://godbolt.org/z/jG5d8Wfe8

before:

bench_write(u16_any_branch_free) 110.66% 2.00us 500.10K
bench_write(u32_any_branch_free) 126.90% 2.00us 499.37K
bench_write(u64_any_branch_free) 193.56% 2.33us 429.37K
bench_write(u16_1b_branch_free) 99.562% 1.91us 522.97K
bench_write(u16_2b_branch_free) 114.92% 2.00us 500.59K
bench_write(u16_3b_branch_free) 111.66% 2.00us 500.99K
bench_write(u32_1b_branch_free) 97.918% 1.93us 518.38K
bench_write(u32_2b_branch_free) 113.76% 1.99us 502.29K
bench_write(u32_3b_branch_free) 111.14% 1.99us 503.03K
bench_write(u32_4b_branch_free) 115.72% 1.97us 507.52K
bench_write(u32_5b_branch_free) 122.05% 2.00us 498.82K
bench_write(u64_1b_branch_free) 99.089% 1.95us 511.71K
bench_write(u64_2b_branch_free) 90.484% 2.53us 396.00K
bench_write(u64_3b_branch_free) 93.335% 2.38us 419.63K
bench_write(u64_4b_branch_free) 100.61% 2.24us 446.86K
bench_write(u64_5b_branch_free) 123.18% 2.37us 421.24K
bench_write(u64_6b_branch_free) 120.10% 2.33us 429.84K
bench_write(u64_7b_branch_free) 144.69% 2.36us 423.79K
bench_write(u64_8b_branch_free) 149.44% 2.25us 443.92K
bench_write(u64_9b_branch_free) 174.37% 2.31us 433.60K
bench_write(u64_10b_branch_free) 176.81% 2.28us 438.61K
bench_write(exponential_1b_branch_free) 108.05% 1.91us 522.52K
bench_write(exponential_2b_branch_free) 118.34% 1.98us 504.37K
bench_write(exponential_3b_branch_free) 114.22% 1.99us 501.87K

after:

bench_write(u16_any_branch_free) 115.30% 1.97us 507.43K
bench_write(u32_any_branch_free) 130.06% 1.97us 508.40K
bench_write(u64_any_branch_free) 226.45% 1.96us 509.18K
bench_write(u16_1b_branch_free) 101.37% 1.84us 543.01K
bench_write(u16_2b_branch_free) 116.65% 1.97us 508.51K
bench_write(u16_3b_branch_free) 111.17% 1.96us 510.12K
bench_write(u32_1b_branch_free) 99.679% 1.93us 519.42K
bench_write(u32_2b_branch_free) 115.98% 1.98us 506.04K
bench_write(u32_3b_branch_free) 111.45% 1.98us 503.85K
bench_write(u32_4b_branch_free) 116.04% 1.95us 513.18K
bench_write(u32_5b_branch_free) 124.59% 1.97us 508.35K
bench_write(u64_1b_branch_free) 99.669% 1.91us 522.26K
bench_write(u64_2b_branch_free) 117.53% 1.93us 518.86K
bench_write(u64_3b_branch_free) 111.95% 1.95us 511.77K
bench_write(u64_4b_branch_free) 111.29% 1.98us 504.98K
bench_write(u64_5b_branch_free) 124.53% 1.96us 510.52K
bench_write(u64_6b_branch_free) 145.48% 1.90us 526.18K
bench_write(u64_7b_branch_free) 172.51% 1.97us 506.83K
bench_write(u64_8b_branch_free) 174.92% 1.95us 514.13K
bench_write(u64_9b_branch_free) 202.27% 1.97us 508.08K
bench_write(u64_10b_branch_free) 205.43% 1.96us 510.44K
bench_write(exponential_1b_branch_free) 105.67% 1.91us 523.63K
bench_write(exponential_2b_branch_free) 116.10% 1.95us 512.64K
bench_write(exponential_3b_branch_free) 119.08% 1.95us 513.34K

Reviewed By: embg

Differential Revision: D73513003

@facebook-github-bot

Copy link
Copy Markdown
Contributor

This pull request was exported from Phabricator. Differential Revision: D73513003

Nicoshev added a commit to Nicoshev/hhvm that referenced this pull request Apr 23, 2025
Summary:

Implemented an explicit SVE version of writeVarint.
Throughput for 64-bit types shows a ~15% improvement.
16-bit and 32-bit cases seem to show a small improvement as well.

All three functions are branch-free; their disassembly can be seen here: https://godbolt.org/z/jG5d8Wfe8

before:

bench_write(u16_any_branch_free)                110.66%     2.00us   500.10K
bench_write(u32_any_branch_free)                126.90%     2.00us   499.37K
bench_write(u64_any_branch_free)                193.56%     2.33us   429.37K
bench_write(u16_1b_branch_free)                 99.562%     1.91us   522.97K
bench_write(u16_2b_branch_free)                 114.92%     2.00us   500.59K
bench_write(u16_3b_branch_free)                 111.66%     2.00us   500.99K
bench_write(u32_1b_branch_free)                 97.918%     1.93us   518.38K
bench_write(u32_2b_branch_free)                 113.76%     1.99us   502.29K
bench_write(u32_3b_branch_free)                 111.14%     1.99us   503.03K
bench_write(u32_4b_branch_free)                 115.72%     1.97us   507.52K
bench_write(u32_5b_branch_free)                 122.05%     2.00us   498.82K
bench_write(u64_1b_branch_free)                 99.089%     1.95us   511.71K
bench_write(u64_2b_branch_free)                 90.484%     2.53us   396.00K
bench_write(u64_3b_branch_free)                 93.335%     2.38us   419.63K
bench_write(u64_4b_branch_free)                 100.61%     2.24us   446.86K
bench_write(u64_5b_branch_free)                 123.18%     2.37us   421.24K
bench_write(u64_6b_branch_free)                 120.10%     2.33us   429.84K
bench_write(u64_7b_branch_free)                 144.69%     2.36us   423.79K
bench_write(u64_8b_branch_free)                 149.44%     2.25us   443.92K
bench_write(u64_9b_branch_free)                 174.37%     2.31us   433.60K
bench_write(u64_10b_branch_free)                176.81%     2.28us   438.61K
bench_write(exponential_1b_branch_free)         108.05%     1.91us   522.52K
bench_write(exponential_2b_branch_free)         118.34%     1.98us   504.37K
bench_write(exponential_3b_branch_free)         114.22%     1.99us   501.87K

after:

bench_write(u16_any_branch_free)                115.30%     1.97us   507.43K
bench_write(u32_any_branch_free)                130.06%     1.97us   508.40K
bench_write(u64_any_branch_free)                226.45%     1.96us   509.18K
bench_write(u16_1b_branch_free)                 101.37%     1.84us   543.01K
bench_write(u16_2b_branch_free)                 116.65%     1.97us   508.51K
bench_write(u16_3b_branch_free)                 111.17%     1.96us   510.12K
bench_write(u32_1b_branch_free)                 99.679%     1.93us   519.42K
bench_write(u32_2b_branch_free)                 115.98%     1.98us   506.04K
bench_write(u32_3b_branch_free)                 111.45%     1.98us   503.85K
bench_write(u32_4b_branch_free)                 116.04%     1.95us   513.18K
bench_write(u32_5b_branch_free)                 124.59%     1.97us   508.35K
bench_write(u64_1b_branch_free)                 99.669%     1.91us   522.26K
bench_write(u64_2b_branch_free)                 117.53%     1.93us   518.86K
bench_write(u64_3b_branch_free)                 111.95%     1.95us   511.77K
bench_write(u64_4b_branch_free)                 111.29%     1.98us   504.98K
bench_write(u64_5b_branch_free)                 124.53%     1.96us   510.52K
bench_write(u64_6b_branch_free)                 145.48%     1.90us   526.18K
bench_write(u64_7b_branch_free)                 172.51%     1.97us   506.83K
bench_write(u64_8b_branch_free)                 174.92%     1.95us   514.13K
bench_write(u64_9b_branch_free)                 202.27%     1.97us   508.08K
bench_write(u64_10b_branch_free)                205.43%     1.96us   510.44K
bench_write(exponential_1b_branch_free)         105.67%     1.91us   523.63K
bench_write(exponential_2b_branch_free)         116.10%     1.95us   512.64K
bench_write(exponential_3b_branch_free)         119.08%     1.95us   513.34K

Reviewed By: embg

Differential Revision: D73513003
Summary:

Implemented an explicit SVE version of writeVarint.
Throughput for 64-bit types shows a ~15% improvement.
16-bit and 32-bit cases seem to show a small improvement as well.

All three functions are branch-free; their disassembly can be seen here: https://godbolt.org/z/jG5d8Wfe8

before:

bench_write(u16_any_branch_free)                110.66%     2.00us   500.10K
bench_write(u32_any_branch_free)                126.90%     2.00us   499.37K
bench_write(u64_any_branch_free)                193.56%     2.33us   429.37K
bench_write(u16_1b_branch_free)                 99.562%     1.91us   522.97K
bench_write(u16_2b_branch_free)                 114.92%     2.00us   500.59K
bench_write(u16_3b_branch_free)                 111.66%     2.00us   500.99K
bench_write(u32_1b_branch_free)                 97.918%     1.93us   518.38K
bench_write(u32_2b_branch_free)                 113.76%     1.99us   502.29K
bench_write(u32_3b_branch_free)                 111.14%     1.99us   503.03K
bench_write(u32_4b_branch_free)                 115.72%     1.97us   507.52K
bench_write(u32_5b_branch_free)                 122.05%     2.00us   498.82K
bench_write(u64_1b_branch_free)                 99.089%     1.95us   511.71K
bench_write(u64_2b_branch_free)                 90.484%     2.53us   396.00K
bench_write(u64_3b_branch_free)                 93.335%     2.38us   419.63K
bench_write(u64_4b_branch_free)                 100.61%     2.24us   446.86K
bench_write(u64_5b_branch_free)                 123.18%     2.37us   421.24K
bench_write(u64_6b_branch_free)                 120.10%     2.33us   429.84K
bench_write(u64_7b_branch_free)                 144.69%     2.36us   423.79K
bench_write(u64_8b_branch_free)                 149.44%     2.25us   443.92K
bench_write(u64_9b_branch_free)                 174.37%     2.31us   433.60K
bench_write(u64_10b_branch_free)                176.81%     2.28us   438.61K
bench_write(exponential_1b_branch_free)         108.05%     1.91us   522.52K
bench_write(exponential_2b_branch_free)         118.34%     1.98us   504.37K
bench_write(exponential_3b_branch_free)         114.22%     1.99us   501.87K

after:

bench_write(u16_any_branch_free)                115.30%     1.97us   507.43K
bench_write(u32_any_branch_free)                130.06%     1.97us   508.40K
bench_write(u64_any_branch_free)                226.45%     1.96us   509.18K
bench_write(u16_1b_branch_free)                 101.37%     1.84us   543.01K
bench_write(u16_2b_branch_free)                 116.65%     1.97us   508.51K
bench_write(u16_3b_branch_free)                 111.17%     1.96us   510.12K
bench_write(u32_1b_branch_free)                 99.679%     1.93us   519.42K
bench_write(u32_2b_branch_free)                 115.98%     1.98us   506.04K
bench_write(u32_3b_branch_free)                 111.45%     1.98us   503.85K
bench_write(u32_4b_branch_free)                 116.04%     1.95us   513.18K
bench_write(u32_5b_branch_free)                 124.59%     1.97us   508.35K
bench_write(u64_1b_branch_free)                 99.669%     1.91us   522.26K
bench_write(u64_2b_branch_free)                 117.53%     1.93us   518.86K
bench_write(u64_3b_branch_free)                 111.95%     1.95us   511.77K
bench_write(u64_4b_branch_free)                 111.29%     1.98us   504.98K
bench_write(u64_5b_branch_free)                 124.53%     1.96us   510.52K
bench_write(u64_6b_branch_free)                 145.48%     1.90us   526.18K
bench_write(u64_7b_branch_free)                 172.51%     1.97us   506.83K
bench_write(u64_8b_branch_free)                 174.92%     1.95us   514.13K
bench_write(u64_9b_branch_free)                 202.27%     1.97us   508.08K
bench_write(u64_10b_branch_free)                205.43%     1.96us   510.44K
bench_write(exponential_1b_branch_free)         105.67%     1.91us   523.63K
bench_write(exponential_2b_branch_free)         116.10%     1.95us   512.64K
bench_write(exponential_3b_branch_free)         119.08%     1.95us   513.34K

Reviewed By: embg

Differential Revision: D73513003
@facebook-github-bot

Copy link
Copy Markdown
Contributor

This pull request was exported from Phabricator. Differential Revision: D73513003

@facebook-github-bot

Copy link
Copy Markdown
Contributor

This pull request was exported from Phabricator. Differential Revision: D73513003

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

2 participants