| /////////////////////////////////////////////////////////////////////////////// |
| // |
| /// \file arm64.c |
| /// \brief Filter for ARM64 binaries |
| /// |
| /// This converts ARM64 relative addresses in the BL and ADRP immediates |
| /// to absolute values to increase redundancy of ARM64 code. |
| /// |
| /// Converting B or ADR instructions was also tested but it's not useful. |
| /// A majority of the jumps for the B instruction are very small (+/- 0xFF). |
| /// These are typical for loops and if-statements. Encoding them to their |
| /// absolute address reduces redundancy since many of the small relative |
| /// jump values are repeated, but very few of the absolute addresses are. |
| // |
| // Authors: Lasse Collin |
| // Jia Tan |
| // Igor Pavlov |
| // |
| // This file has been put into the public domain. |
| // You can do whatever you want with this file. |
| // |
| /////////////////////////////////////////////////////////////////////////////// |
| |
| #include "simple_private.h" |
| |
| |
| static size_t |
| arm64_code(void *simple lzma_attribute((__unused__)), |
| uint32_t now_pos, bool is_encoder, |
| uint8_t *buffer, size_t size) |
| { |
| size_t i; |
| |
| // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower |
| // with auto-vectorization that is enabled by default with -O2. |
| // Such vectorization bloat happens with -O2 when targeting ARM64 too |
| // but performance hasn't been tested. |
| #ifdef __clang__ |
| # pragma clang loop vectorize(disable) |
| #endif |
| for (i = 0; i + 4 <= size; i += 4) { |
| uint32_t pc = (uint32_t)(now_pos + i); |
| uint32_t instr = read32le(buffer + i); |
| |
| if ((instr >> 26) == 0x25) { |
| // BL instruction: |
| // The full 26-bit immediate is converted. |
| // The range is +/-128 MiB. |
| // |
| // Using the full range is helps quite a lot with |
| // big executables. Smaller range would reduce false |
| // positives in non-code sections of the input though |
| // so this is a compromise that slightly favors big |
| // files. With the full range only six bits of the 32 |
| // need to match to trigger a conversion. |
| const uint32_t src = instr; |
| instr = 0x94000000; |
| |
| pc >>= 2; |
| if (!is_encoder) |
| pc = 0U - pc; |
| |
| instr |= (src + pc) & 0x03FFFFFF; |
| write32le(buffer + i, instr); |
| |
| } else if ((instr & 0x9F000000) == 0x90000000) { |
| // ADRP instruction: |
| // Only values in the range +/-512 MiB are converted. |
| // |
| // Using less than the full +/-4 GiB range reduces |
| // false positives on non-code sections of the input |
| // while being excellent for executables up to 512 MiB. |
| // The positive effect of ADRP conversion is smaller |
| // than that of BL but it also doesn't hurt so much in |
| // non-code sections of input because, with +/-512 MiB |
| // range, nine bits of 32 need to match to trigger a |
| // conversion (two 10-bit match choices = 9 bits). |
| const uint32_t src = ((instr >> 29) & 3) |
| | ((instr >> 3) & 0x001FFFFC); |
| |
| // With the addition only one branch is needed to |
| // check the +/- range. This is usually false when |
| // processing ARM64 code so branch prediction will |
| // handle it well in terms of performance. |
| // |
| //if ((src & 0x001E0000) != 0 |
| // && (src & 0x001E0000) != 0x001E0000) |
| if ((src + 0x00020000) & 0x001C0000) |
| continue; |
| |
| instr &= 0x9000001F; |
| |
| pc >>= 12; |
| if (!is_encoder) |
| pc = 0U - pc; |
| |
| const uint32_t dest = src + pc; |
| instr |= (dest & 3) << 29; |
| instr |= (dest & 0x0003FFFC) << 3; |
| instr |= (0U - (dest & 0x00020000)) & 0x00E00000; |
| write32le(buffer + i, instr); |
| } |
| } |
| |
| return i; |
| } |
| |
| |
| static lzma_ret |
| arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator, |
| const lzma_filter_info *filters, bool is_encoder) |
| { |
| return lzma_simple_coder_init(next, allocator, filters, |
| &arm64_code, 0, 4, 4, is_encoder); |
| } |
| |
| |
#ifdef HAVE_ENCODER_ARM64
/// \brief      Initialize the ARM64 filter in the encoding direction
///
/// Only compiled when HAVE_ENCODER_ARM64 is defined.
///
/// \return     The value returned by arm64_coder_init().
extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool is_encoder = true;

	return arm64_coder_init(next, allocator, filters, is_encoder);
}
#endif
| |
| |
#ifdef HAVE_DECODER_ARM64
/// \brief      Initialize the ARM64 filter in the decoding direction
///
/// Only compiled when HAVE_DECODER_ARM64 is defined.
///
/// \return     The value returned by arm64_coder_init().
extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool is_encoder = false;

	return arm64_coder_init(next, allocator, filters, is_encoder);
}
#endif