From 88e69a1fcc1e67dec3025af64736a84532528242 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:07:58 +0100 Subject: bpf, x64: save one byte per shl/shr/sar when imm is 1 When we shift by one, we can use a different encoding where imm is not explicitly needed, which saves 1 byte per such op. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4923d92..4bc36bd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -640,7 +640,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_RSH: b3 = 0xE8; break; case BPF_ARSH: b3 = 0xF8; break; } - EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); + + if (imm32 == 1) + EMIT2(0xD1, add_1reg(b3, dst_reg)); + else + EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); break; case BPF_ALU | BPF_LSH | BPF_X: -- cgit v1.1 From 6fe8b9c1f41dfe3209dabc5bd0726e003a065288 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:07:59 +0100 Subject: bpf, x64: save several bytes by using mov over movabsq when possible While analyzing some of the more complex BPF programs from Cilium, I found that LLVM generally prefers to emit LD_IMM64 instead of MOV32 BPF instructions for loading unsigned 32-bit immediates into a register. Given we cannot change the current/stable LLVM versions that are already out there, lets optimize this case such that the JIT prefers to emit 'mov %eax, imm32' over 'movabsq %rax, imm64' whenever suitable in order to reduce the image size by 4-5 bytes per such load in the typical case, reducing image size on some of the bigger programs by up to 4%. emit_mov_imm32() and emit_mov_imm64() have been added as helpers. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 125 ++++++++++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 51 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4bc36bd..f3e5cd8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -60,7 +60,12 @@ static bool is_imm8(int value) static bool is_simm32(s64 value) { - return value == (s64) (s32) value; + return value == (s64)(s32)value; +} + +static bool is_uimm32(u64 value) +{ + return value == (u64)(u32)value; } /* mov dst, src */ @@ -355,6 +360,68 @@ static void emit_load_skb_data_hlen(u8 **pprog) *pprog = prog; } +static void emit_mov_imm32(u8 **pprog, bool sign_propagate, + u32 dst_reg, const u32 imm32) +{ + u8 *prog = *pprog; + u8 b1, b2, b3; + int cnt = 0; + + /* optimization: if imm32 is positive, use 'mov %eax, imm32' + * (which zero-extends imm32) to save 2 bytes. + */ + if (sign_propagate && (s32)imm32 < 0) { + /* 'mov %rax, imm32' sign extends imm32 */ + b1 = add_1mod(0x48, dst_reg); + b2 = 0xC7; + b3 = 0xC0; + EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); + goto done; + } + + /* optimization: if imm32 is zero, use 'xor %eax, %eax' + * to save 3 bytes. + */ + if (imm32 == 0) { + if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + b2 = 0x31; /* xor */ + b3 = 0xC0; + EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); + goto done; + } + + /* mov %eax, imm32 */ + if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); + EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); +done: + *pprog = prog; +} + +static void emit_mov_imm64(u8 **pprog, u32 dst_reg, + const u32 imm32_hi, const u32 imm32_lo) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { + /* For emitting plain u32, where sign bit must not be + * propagated LLVM tends to load imm64 over mov32 + * directly, so save couple of bytes by just doing + * 'mov %eax, imm32' instead. + */ + emit_mov_imm32(&prog, false, dst_reg, imm32_lo); + } else { + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(imm32_lo, 4); + EMIT(imm32_hi, 4); + } + + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -377,7 +444,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; - u8 b1 = 0, b2 = 0, b3 = 0; + u8 b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; bool reload_skb_data; @@ -485,58 +552,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; case BPF_ALU64 | BPF_MOV | BPF_K: - /* optimization: if imm32 is positive, - * use 'mov eax, imm32' (which zero-extends imm32) - * to save 2 bytes - */ - if (imm32 < 0) { - /* 'mov rax, imm32' sign extends imm32 */ - b1 = add_1mod(0x48, dst_reg); - b2 = 0xC7; - b3 = 0xC0; - EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); - break; - } - case BPF_ALU | BPF_MOV | BPF_K: - /* optimization: if imm32 is zero, use 'xor ,' - * to save 3 bytes. - */ - if (imm32 == 0) { - if (is_ereg(dst_reg)) - EMIT1(add_2mod(0x40, dst_reg, dst_reg)); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); - break; - } - - /* mov %eax, imm32 */ - if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); - EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); + emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, imm32); break; case BPF_LD | BPF_IMM | BPF_DW: - /* optimization: if imm64 is zero, use 'xor ,' - * to save 7 bytes. - */ - if (insn[0].imm == 0 && insn[1].imm == 0) { - b1 = add_2mod(0x48, dst_reg, dst_reg); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg)); - - insn++; - i++; - break; - } - - /* movabsq %rax, imm64 */ - EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); - EMIT(insn[0].imm, 4); - EMIT(insn[1].imm, 4); - + emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm); insn++; i++; break; @@ -604,7 +626,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT_mov(BPF_REG_0, src_reg); else /* mov rax, imm32 */ - EMIT3_off32(0x48, 0xC7, 0xC0, imm32); + emit_mov_imm32(&prog, true, + BPF_REG_0, imm32); if (BPF_CLASS(insn->code) == BPF_ALU64) EMIT1(add_1mod(0x48, AUX_REG)); -- cgit v1.1 From d806a0cf2a1ddb97c91d902ef1c8219e1e2b2c4c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:08:00 +0100 Subject: bpf, x64: save several bytes when mul dest is r0/r3 anyway Instead of unconditionally performing push/pop on rax/rdx in case of multiplication, we can save a few bytes in case of dest register being either BPF r0 (rax) or r3 (rdx) since the result is written in there anyway. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f3e5cd8..9895ca3 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -615,8 +615,10 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_X: - EMIT1(0x50); /* push rax */ - EMIT1(0x52); /* push rdx */ + if (dst_reg != BPF_REG_0) + EMIT1(0x50); /* push rax */ + if (dst_reg != BPF_REG_3) + EMIT1(0x52); /* push rdx */ /* mov r11, dst_reg */ EMIT_mov(AUX_REG, dst_reg); @@ -636,14 +638,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* mul(q) r11 */ EMIT2(0xF7, add_1reg(0xE0, AUX_REG)); - /* mov r11, rax */ - EMIT_mov(AUX_REG, BPF_REG_0); - - EMIT1(0x5A); /* pop rdx */ - EMIT1(0x58); /* pop rax */ - - /* mov dst_reg, r11 */ - EMIT_mov(dst_reg, AUX_REG); + if (dst_reg != BPF_REG_3) + EMIT1(0x5A); /* pop rdx */ + if (dst_reg != BPF_REG_0) { + /* mov dst_reg, rax */ + EMIT_mov(dst_reg, BPF_REG_0); + EMIT1(0x58); /* pop rax */ + } break; /* shifts */ -- cgit v1.1 From 4c38e2f386b4fc5fd95d1203c74819948e2e903d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:08:01 +0100 Subject: bpf, x64: save few bytes when mul is in alu32 Add a generic emit_mov_reg() helper in order to reuse it in BPF multiplication to load the src into rax, we can save a few bytes in alu32 while doing so. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9895ca3..5b8fc13 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -422,6 +422,24 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, *pprog = prog; } +static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is64) { + /* mov dst, src */ + EMIT_mov(dst_reg, src_reg); + } else { + /* mov32 dst, src */ + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg)); + } + + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -480,16 +498,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; - /* mov dst, src */ case BPF_ALU64 | BPF_MOV | BPF_X: - EMIT_mov(dst_reg, src_reg); - break; - - /* mov32 dst, src */ case BPF_ALU | BPF_MOV | BPF_X: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); - EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg)); + emit_mov_reg(&prog, + BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, src_reg); break; /* neg dst */ @@ -615,6 +628,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_X: + { + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + if (dst_reg != BPF_REG_0) EMIT1(0x50); /* push rax */ if (dst_reg != BPF_REG_3) @@ -624,14 +640,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT_mov(AUX_REG, dst_reg); if (BPF_SRC(insn->code) == BPF_X) - /* mov rax, src_reg */ - EMIT_mov(BPF_REG_0, src_reg); + emit_mov_reg(&prog, is64, BPF_REG_0, src_reg); else - /* mov rax, imm32 */ - emit_mov_imm32(&prog, true, - BPF_REG_0, imm32); + emit_mov_imm32(&prog, is64, BPF_REG_0, imm32); - if (BPF_CLASS(insn->code) == BPF_ALU64) + if (is64) EMIT1(add_1mod(0x48, AUX_REG)); else if (is_ereg(AUX_REG)) EMIT1(add_1mod(0x40, AUX_REG)); @@ -646,7 +659,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1(0x58); /* pop rax */ } break; - + } /* shifts */ case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: -- cgit v1.1 From 0869175220b339b81de48872c8198c3ed75782e3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:08:02 +0100 Subject: bpf, x64: save 5 bytes in prologue when ebpf insns came from cbpf While it's rather cumbersome to reduce prologue for cBPF->eBPF migrations wrt spill/fill for r15 which is callee saved register due to bpf_error path in bpf_jit.S that is both used by migrations as well as native eBPF, we can still trivially save 5 bytes in prologue for the former since tail calls can never be used there. cBPF->eBPF migrations also have their own custom prologue in BPF asm that xors A and X reg anyway, so it's fine we skip this here. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5b8fc13..70f9748 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -216,7 +216,7 @@ struct jit_context { /* emit x64 prologue code for BPF program and check it's size. * bpf_tail_call helper will skip it while jumping into another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth) +static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) { u8 *prog = *pprog; int cnt = 0; @@ -251,18 +251,21 @@ static void emit_prologue(u8 **pprog, u32 stack_depth) /* mov qword ptr [rbp+24],r15 */ EMIT4(0x4C, 0x89, 0x7D, 24); - /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls - * we need to reset the counter to 0. It's done in two instructions, - * resetting rax register to 0 (xor on eax gets 0 extended), and - * moving it to the counter location. - */ + if (!ebpf_from_cbpf) { + /* Clear the tail call counter (tail_call_cnt): for eBPF tail + * calls we need to reset the counter to 0. It's done in two + * instructions, resetting rax register to 0, and moving it + * to the counter location. + */ - /* xor eax, eax */ - EMIT2(0x31, 0xc0); - /* mov qword ptr [rbp+32], rax */ - EMIT4(0x48, 0x89, 0x45, 32); + /* xor eax, eax */ + EMIT2(0x31, 0xc0); + /* mov qword ptr [rbp+32], rax */ + EMIT4(0x48, 0x89, 0x45, 32); + + BUILD_BUG_ON(cnt != PROLOGUE_SIZE); + } - BUILD_BUG_ON(cnt != PROLOGUE_SIZE); *pprog = prog; } @@ -453,7 +456,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int proglen = 0; u8 *prog = temp; - emit_prologue(&prog, bpf_prog->aux->stack_depth); + emit_prologue(&prog, bpf_prog->aux->stack_depth, + bpf_prog_was_classic(bpf_prog)); if (seen_ld_abs) emit_load_skb_data_hlen(&prog); -- cgit v1.1 From f8c193ca1f7d3b69c031970815d14e09de396ee8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 22 Feb 2018 22:58:33 +0100 Subject: arm: mvebu: 370-rd: Enable PHY interrupt handling The Ethernet switch has an embedded interrupt controller. Interrupts from the embedded PHYs are part of this interrupt controller. Explicitly list the MDIO bus the embedded PHYs are on, and wire up the interrupts. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- arch/arm/boot/dts/armada-370-rd.dts | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/armada-370-rd.dts b/arch/arm/boot/dts/armada-370-rd.dts index 8b2fa9a..c28afb2 100644 --- a/arch/arm/boot/dts/armada-370-rd.dts +++ b/arch/arm/boot/dts/armada-370-rd.dts @@ -56,6 +56,7 @@ /dts-v1/; #include +#include #include #include "armada-370.dtsi" @@ -243,6 +244,8 @@ #address-cells = <1>; #size-cells = <0>; reg = <0x10>; + interrupt-controller; + #interrupt-cells = <2>; ports { #address-cells = <1>; @@ -278,6 +281,35 @@ }; }; }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + switchphy0: switchphy@0 { + reg = <0>; + interrupt-parent = <&switch>; + interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; + }; + + switchphy1: switchphy@1 { + reg = <1>; + interrupt-parent = <&switch>; + interrupts = <1 IRQ_TYPE_LEVEL_HIGH>; + }; + + switchphy2: switchphy@2 { + reg = <2>; + interrupt-parent = <&switch>; + interrupts = <2 IRQ_TYPE_LEVEL_HIGH>; + }; + + switchphy3: switchphy@3 { + reg = <3>; + interrupt-parent = <&switch>; + interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; + }; + }; }; }; -- cgit v1.1 From 71d22d58b6e507147a9008ce2fd690cf311c97f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 26 Feb 2018 22:13:52 +0100 Subject: bpf, x64: remove bpf_flush_icache Unlike other archs flush_icache_range() is a noop on x64, therefore remove the JIT's bpf_flush_icache() altogether since not needed. Reported-by: Linus Torvalds Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index cbf94d4..eb661ff 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -11,10 +11,10 @@ #include #include #include -#include +#include + #include #include -#include /* * assembly code in arch/x86/net/bpf_jit.S @@ -103,16 +103,6 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define X86_JLE 0x7E #define X86_JG 0x7F -static void bpf_flush_icache(void *start, void *end) -{ - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - smp_wmb(); - flush_icache_range((unsigned long)start, (unsigned long)end); - set_fs(old_fs); -} - #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) @@ -1266,7 +1256,6 @@ skip_init_addrs: bpf_jit_dump(prog->len, proglen, pass + 1, image); if (image) { - bpf_flush_icache(header, image + proglen); if (!prog->is_func || extra_pass) { bpf_jit_binary_lock_ro(header); } else { -- cgit v1.1 From 43bf2e6d69dd6c2cea7a28763893a3dff34b7873 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Thu, 1 Mar 2018 18:29:28 -0500 Subject: net/mac89x0: Convert to platform_driver Apparently these Dayna cards don't have a pseudoslot declaration ROM which means they can't be probed like NuBus cards. Cc: Geert Uytterhoeven Signed-off-by: Finn Thain Acked-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- arch/m68k/mac/config.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index d3d4352..c73eb82 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -1088,6 +1088,10 @@ int __init mac_platform_init(void) macintosh_config->expansion_type == MAC_EXP_PDS_COMM) platform_device_register_simple("macsonic", -1, NULL, 0); + if (macintosh_config->expansion_type == MAC_EXP_PDS || + macintosh_config->expansion_type == MAC_EXP_PDS_COMM) + platform_device_register_simple("mac89x0", -1, NULL, 0); + if (macintosh_config->ether_type == MAC_ETHER_MACE) platform_device_register_simple("macmace", -1, NULL, 0); -- cgit v1.1