qbe: Update to latest git and tweak some patches - oasis - Own branch of Oasis Linux (upstream: <https://git.sr.ht/~mcf/oasis/>)

commit: 066e68c4fe79238bc3377dea38585eb56b5934b1
parent f8b20538cc063224c6a1a2a8a65fbaa5e87c3f3c
Author: Michael Forney <mforney@mforney.org>
Date:   Tue, 26 Oct 2021 19:34:26 -0700

qbe: Update to latest git and tweak some patches

Diffstat:
D pkg/qbe/patch/0001-arm64-Handle-slots.patch 36 ------------------------------------
A pkg/qbe/patch/0001-arm64-prevent-stack-clobber-when-passing-structures-.patch 33 +++++++++++++++++++++++++++++++++
D pkg/qbe/patch/0002-arm64-Handle-slots-in-Ocopy-operands.patch 56 --------------------------------------------------------
A pkg/qbe/patch/0002-increase-NString-to-72.patch 25 +++++++++++++++++++++++++
D pkg/qbe/patch/0003-arm64-Prevent-stack-clobber-when-passing-structures-.patch 33 ---------------------------------
A pkg/qbe/patch/0003-fold-don-t-fold-invalid-addition-subtraction-rather-.patch 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D pkg/qbe/patch/0004-Increase-NString-to-96.patch 25 -------------------------
A pkg/qbe/patch/0004-gas-put-zero-data-into-.bss.patch 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D pkg/qbe/patch/0005-fold-Don-t-fold-invalid-addition-subtraction-rather-.patch 66 ------------------------------------------------------------------
D pkg/qbe/patch/0006-amd64-optimize-loading-0-into-floating-point-registe.patch 76 ----------------------------------------------------------------------------
D pkg/qbe/patch/0007-amd64-optimize-zeroing-of-integer-registers-as-well.patch 34 ----------------------------------
M pkg/qbe/ver 2 +-

13 files changed, 310 insertions(+), 327 deletions(-)
diff --git a/pkg/qbe/patch/0001-arm64-Handle-slots.patch b/pkg/qbe/patch/0001-arm64-Handle-slots.patch
@@ -1,36 +0,0 @@
-From c8849e9c7b382f92326434a6522a26829a6e20f8 Mon Sep 17 00:00:00 2001
-From: Michael Forney <mforney@mforney.org>
-Date: Wed, 8 May 2019 18:29:28 -0700
-Subject: [PATCH] arm64: Handle slots
-
----
- arm64/emit.c | 13 +++++++++++--
- 1 file changed, 11 insertions(+), 2 deletions(-)
-
-diff --git a/arm64/emit.c b/arm64/emit.c
-index 59e1aae..9cc4a64 100644
---- a/arm64/emit.c
-+++ b/arm64/emit.c
-@@ -220,8 +220,17 @@ emitf(char *s, Ins *i, E *e)
- 			c = *s++;
- 			assert(c == '0' || c == '1');
- 			r = i->arg[c - '0'];
--			assert(isreg(r) && "TODO emit non reg addresses");
--			fprintf(e->f, "[%s]", rname(r.val, Kl));
-+			switch (rtype(r)) {
-+			default:
-+				die("TODO emit non reg addresses");
-+			case RTmp:
-+				assert(isreg(r));
-+				fprintf(e->f, "[%s]", rname(r.val, Kl));
-+				break;
-+			case RSlot:
-+				fprintf(e->f, "[sp, %"PRIu64"]", slot(r.val, e));
-+				break;
-+			}
- 			break;
- 		}
- 	}
--- 
-2.21.0
-
diff --git a/pkg/qbe/patch/0001-arm64-prevent-stack-clobber-when-passing-structures-.patch b/pkg/qbe/patch/0001-arm64-prevent-stack-clobber-when-passing-structures-.patch
@@ -0,0 +1,33 @@
+From 6c1744026545445511f1c500653bab859bc79b50 Mon Sep 17 00:00:00 2001
+From: Michael Forney <mforney@mforney.org>
+Date: Sat, 11 May 2019 19:38:13 -0700
+Subject: [PATCH] arm64: prevent stack clobber when passing structures < 8
+ bytes
+
+---
+ arm64/abi.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/arm64/abi.c b/arm64/abi.c
+index 8209944..f37c892 100644
+--- a/arm64/abi.c
++++ b/arm64/abi.c
+@@ -312,12 +312,14 @@ stkblob(Ref r, Class *c, Fn *fn, Insl **ilp)
+ {
+ 	Insl *il;
+ 	int al;
++	uint64_t sz;
+ 
+ 	il = alloc(sizeof *il);
+ 	al = c->t->align - 2; /* NAlign == 3 */
+ 	if (al < 0)
+ 		al = 0;
+-	il->i = (Ins){Oalloc+al, Kl, r, {getcon(c->t->size, fn)}};
++	sz = c->class & Cptr ? c->t->size : c->size;
++	il->i = (Ins){Oalloc+al, Kl, r, {getcon(sz, fn)}};
+ 	il->link = *ilp;
+ 	*ilp = il;
+ }
+-- 
+2.32.0
+
diff --git a/pkg/qbe/patch/0002-arm64-Handle-slots-in-Ocopy-operands.patch b/pkg/qbe/patch/0002-arm64-Handle-slots-in-Ocopy-operands.patch
@@ -1,56 +0,0 @@
-From d9d890583d93f1bfdc38e4aa890350d4111b848a Mon Sep 17 00:00:00 2001
-From: Michael Forney <mforney@mforney.org>
-Date: Thu, 9 May 2019 23:32:15 -0700
-Subject: [PATCH] arm64: Handle slots in Ocopy operands
-
----
- arm64/emit.c | 25 +++++++++++++++++++++----
- 1 file changed, 21 insertions(+), 4 deletions(-)
-
-diff --git a/arm64/emit.c b/arm64/emit.c
-index 9ebcfcd..5a3fe55 100644
---- a/arm64/emit.c
-+++ b/arm64/emit.c
-@@ -218,8 +218,8 @@ emitf(char *s, Ins *i, E *e)
- 			break;
- 		case 'M':
- 			c = *s++;
--			assert(c == '0' || c == '1');
--			r = i->arg[c - '0'];
-+			assert(c == '0' || c == '1' || c == '=');
-+			r = c == '=' ? i->to : i->arg[c - '0'];
- 			switch (rtype(r)) {
- 			default:
- 				die("TODO emit non reg addresses");
-@@ -307,9 +307,26 @@ emitins(Ins *i, E *e)
- 	case Ocopy:
- 		if (req(i->to, i->arg[0]))
- 			break;
--		if (rtype(i->arg[0]) != RCon)
-+		if (rtype(i->to) == RSlot) {
-+			if (rtype(i->arg[0]) == RSlot) {
-+				emitf("ldr %?, %M0\n\tstr %?, %M=", i, e);
-+			} else {
-+				assert(isreg(i->arg[0]));
-+				emitf("str %0, %M=", i, e);
-+			}
-+			break;
-+		}
-+		assert(isreg(i->to));
-+		switch (rtype(i->arg[0])) {
-+		case RCon:
-+			loadcon(&e->fn->con[i->arg[0].val], i->to.val, i->cls, e->f);
-+			break;
-+		case RSlot:
-+			emitf("ldr %=, %M0", i, e);
-+			break;
-+		default:
- 			goto Table;
--		loadcon(&e->fn->con[i->arg[0].val], i->to.val, i->cls, e->f);
-+		}
- 		break;
- 	case Oaddr:
- 		assert(rtype(i->arg[0]) == RSlot);
--- 
-2.32.0
-
diff --git a/pkg/qbe/patch/0002-increase-NString-to-72.patch b/pkg/qbe/patch/0002-increase-NString-to-72.patch
@@ -0,0 +1,25 @@
+From 294fedc93dbeac68f0beec1eeea62be30227b025 Mon Sep 17 00:00:00 2001
+From: Michael Forney <mforney@mforney.org>
+Date: Fri, 31 May 2019 13:31:04 -0700
+Subject: [PATCH] increase NString to 72
+
+---
+ all.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/all.h b/all.h
+index 37980d3..f49b4ef 100644
+--- a/all.h
++++ b/all.h
+@@ -31,7 +31,7 @@ typedef struct Dat Dat;
+ typedef struct Target Target;
+ 
+ enum {
+-	NString = 64,
++	NString = 72,
+ 	NIns    = 1 << 20,
+ 	NAlign  = 3,
+ 	NField  = 32,
+-- 
+2.32.0
+
diff --git a/pkg/qbe/patch/0003-arm64-Prevent-stack-clobber-when-passing-structures-.patch b/pkg/qbe/patch/0003-arm64-Prevent-stack-clobber-when-passing-structures-.patch
@@ -1,33 +0,0 @@
-From ffd2585ef162a6dcc42011a33bd69687048ab4a8 Mon Sep 17 00:00:00 2001
-From: Michael Forney <mforney@mforney.org>
-Date: Sat, 11 May 2019 19:38:13 -0700
-Subject: [PATCH] arm64: Prevent stack clobber when passing structures < 8
- bytes
-
----
- arm64/abi.c | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/arm64/abi.c b/arm64/abi.c
-index f5b605a..4e80db2 100644
---- a/arm64/abi.c
-+++ b/arm64/abi.c
-@@ -308,12 +308,14 @@ stkblob(Ref r, Class *c, Fn *fn, Insl **ilp)
- {
- 	Insl *il;
- 	int al;
-+	uint64_t sz;
- 
- 	il = alloc(sizeof *il);
- 	al = c->t->align - 2; /* NAlign == 3 */
- 	if (al < 0)
- 		al = 0;
--	il->i = (Ins){Oalloc+al, Kl, r, {getcon(c->t->size, fn)}};
-+	sz = c->class & Cptr ? c->t->size : c->size;
-+	il->i = (Ins){Oalloc+al, Kl, r, {getcon(sz, fn)}};
- 	il->link = *ilp;
- 	*ilp = il;
- }
--- 
-2.21.0
-
diff --git a/pkg/qbe/patch/0003-fold-don-t-fold-invalid-addition-subtraction-rather-.patch b/pkg/qbe/patch/0003-fold-don-t-fold-invalid-addition-subtraction-rather-.patch
@@ -0,0 +1,66 @@
+From b3c8dfafafd7e749a12227c951f3faebc2572710 Mon Sep 17 00:00:00 2001
+From: Michael Forney <mforney@mforney.org>
+Date: Sun, 16 Jun 2019 01:38:27 -0700
+Subject: [PATCH] fold: don't fold invalid addition/subtraction rather than
+ failing
+
+This may happen in a branch QBE doesn't realize is unreachable,
+for example (simplified from real code found in ncurses)
+
+	data $str = { b "abcdef", b 0 }
+	function l $f(w %x) {
+	@start
+		%.1 =w ceqw %x, 0
+		jnz %.1, @logic_join, @logic_right
+	@logic_right
+		%p =l call $strchr(l $str, w %x)
+		%.2 =w ceql %p, 0
+	@logic_join
+		%.3 =w phi @start %.1, @logic_right %.2
+		jnz %.3, @fail, @return
+	@fail
+		ret 0
+	@return
+		%.4 =l sub %p, $str
+		ret %.4
+	}
+---
+ fold.c | 11 ++++-------
+ 1 file changed, 4 insertions(+), 7 deletions(-)
+
+diff --git a/fold.c b/fold.c
+index 2081a72..50a862e 100644
+--- a/fold.c
++++ b/fold.c
+@@ -343,7 +343,7 @@ foldint(Con *res, int op, int w, Con *cl, Con *cr)
+ 	if (op == Oadd) {
+ 		if (cl->type == CAddr) {
+ 			if (cr->type == CAddr)
+-				err("undefined addition (addr + addr)");
++				return 1;
+ 			lab = cl->label;
+ 			typ = CAddr;
+ 		}
+@@ -358,16 +358,13 @@ foldint(Con *res, int op, int w, Con *cl, Con *cr)
+ 				lab = cl->label;
+ 				typ = CAddr;
+ 			} else if (cl->label != cr->label)
+-				err("undefined substraction (addr1 - addr2)");
++				return 1;
+ 		}
+ 		else if (cr->type == CAddr)
+-			err("undefined substraction (num - addr)");
+-	}
+-	else if (cl->type == CAddr || cr->type == CAddr) {
+-		if (Ocmpl <= op && op <= Ocmpl1)
+ 			return 1;
+-		err("invalid address operand for '%s'", optab[op].name);
+ 	}
++	else if (cl->type == CAddr || cr->type == CAddr)
++		return 1;
+ 	switch (op) {
+ 	case Oadd:  x = l.u + r.u; break;
+ 	case Osub:  x = l.u - r.u; break;
+-- 
+2.32.0
+
diff --git a/pkg/qbe/patch/0004-Increase-NString-to-96.patch b/pkg/qbe/patch/0004-Increase-NString-to-96.patch
@@ -1,25 +0,0 @@
-From 948e221acc92d002662ffa609a252a3410a93001 Mon Sep 17 00:00:00 2001
-From: Michael Forney <mforney@mforney.org>
-Date: Fri, 31 May 2019 13:31:04 -0700
-Subject: [PATCH] Increase NString to 96
-
----
- all.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/all.h b/all.h
-index 7f843a9..805a346 100644
---- a/all.h
-+++ b/all.h
-@@ -31,7 +31,7 @@ typedef struct Dat Dat;
- typedef struct Target Target;
- 
- enum {
--	NString = 64,
-+	NString = 96,
- 	NIns    = 1 << 20,
- 	NAlign  = 3,
- 	NField  = 32,
--- 
-2.28.0
-
diff --git a/pkg/qbe/patch/0004-gas-put-zero-data-into-.bss.patch b/pkg/qbe/patch/0004-gas-put-zero-data-into-.bss.patch
@@ -0,0 +1,102 @@
+From e81a67355f1a53739cbfd9797ac9d687efff05e8 Mon Sep 17 00:00:00 2001
+From: Michael Forney <mforney@mforney.org>
+Date: Tue, 28 Sep 2021 11:25:05 -0700
+Subject: [PATCH] gas: put zero data into .bss
+
+---
+ gas.c | 56 ++++++++++++++++++++++++++++++++++++++++----------------
+ 1 file changed, 40 insertions(+), 16 deletions(-)
+
+diff --git a/gas.c b/gas.c
+index 8c31794..ce082dc 100644
+--- a/gas.c
++++ b/gas.c
+@@ -3,10 +3,26 @@
+ 
+ char *gasloc, *gassym;
+ 
++static void
++startdat(FILE *f, char *section, char *name, int align, int export, int zero)
++{
++	char *p;
++
++	if (section)
++		fprintf(f, ".section %s\n", section);
++	else
++		fprintf(f, "%s\n", zero ? ".bss" : ".data");
++	fprintf(f, ".balign %d\n", align);
++	p = name[0] == '"' ? "" : gassym;
++	if (export)
++		fprintf(f, ".globl %s%s\n", p, name);
++	fprintf(f, "%s%s:\n", p, name);
++}
++
+ void
+ gasemitdat(Dat *d, FILE *f)
+ {
+-	static int aligned;
++	static int aligned, export;
+ 	static char *dtoa[] = {
+ 		[DAlign] = ".balign",
+ 		[DB] = "\t.byte",
+@@ -14,34 +30,42 @@ gasemitdat(Dat *d, FILE *f)
+ 		[DW] = "\t.int",
+ 		[DL] = "\t.quad"
+ 	};
++	static char *name, *section;
++	static int64_t zero;
+ 	char *p;
+ 
+ 	switch (d->type) {
+ 	case DStart:
+ 		aligned = 0;
+-		if (d->u.str) {
+-			fprintf(f, ".section %s\n", d->u.str);
+-		} else {
+-			fprintf(f, ".data\n");
+-		}
++		zero = 0;
++		section = d->u.str;
+ 		break;
+ 	case DEnd:
++		if (zero != -1) {
++			startdat(f, section, name, aligned, export, 1);
++			fprintf(f, "\t.fill %"PRId64",1,0\n", zero);
++		}
++		break;
++	case DAlign:
++		aligned = d->u.num;
+ 		break;
+ 	case DName:
+-		if (!aligned)
+-			fprintf(f, ".balign 8\n");
+-		p = d->u.str[0] == '"' ? "" : gassym;
+-		if (d->export)
+-			fprintf(f, ".globl %s%s\n", p, d->u.str);
+-		fprintf(f, "%s%s:\n", p, d->u.str);
++		name = d->u.str;
++		export = d->export;
+ 		break;
+ 	case DZ:
+-		fprintf(f, "\t.fill %"PRId64",1,0\n", d->u.num);
++		if (zero != -1)
++			zero += d->u.num;
++		else
++			fprintf(f, "\t.fill %"PRId64",1,0\n", d->u.num);
+ 		break;
+ 	default:
+-		if (d->type == DAlign)
+-			aligned = 1;
+-
++		if (zero != -1) {
++			startdat(f, section, name, aligned, export, 0);
++			if (zero > 0)
++				fprintf(f, "\t.fill %"PRId64",1,0\n", zero);
++			zero = -1;
++		}
+ 		if (d->isstr) {
+ 			if (d->type != DB)
+ 				err("strings only supported for 'b' currently");
+-- 
+2.32.0
+
diff --git a/pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch b/pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch
@@ -0,0 +1,83 @@
+From a11da13e22a694f8fe4a81d894d433f50ce4af6b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=C3=89rico=20Nogueira?= <erico.erc@gmail.com>
+Date: Sun, 11 Jul 2021 19:19:12 -0300
+Subject: [PATCH] amd64: optimize loading 0 into registers
+
+Loading +0 into a floating point register can be done using pxor or
+xorps instructions. Per [1], we went with pxor because it can run on all
+vector ALU ports, even if it's one byte longer.
+
+Similarly, an integer register can be zeroed with xor, which has a
+smaller encoding than mov with 0 immediate.
+
+To implement this, we special case fixarg to allow Ocopy when the
+value is +0 for floating point, and change emitins to emit pxor/xor
+when it encounters a copy from 0.
+
+Co-authored-by: Michael Forney <mforney@mforney.org>
+
+[1] https://stackoverflow.com/questions/39811577/does-using-mix-of-pxor-and-xorps-affect-performance/39828976
+---
+ amd64/emit.c | 12 ++++++++++++
+ amd64/isel.c | 12 +++++++-----
+ 2 files changed, 19 insertions(+), 5 deletions(-)
+
+diff --git a/amd64/emit.c b/amd64/emit.c
+index a888000..7aeeff5 100644
+--- a/amd64/emit.c
++++ b/amd64/emit.c
+@@ -443,6 +443,18 @@ emitins(Ins i, Fn *fn, FILE *f)
+ 		if (req(i.to, i.arg[0]))
+ 			break;
+ 		t0 = rtype(i.arg[0]);
++		if (t0 == RCon
++		&& fn->con[i.arg[0].val].type == CBits
++		&& fn->con[i.arg[0].val].bits.i == 0) {
++			if (isreg(i.to)) {
++				if (KBASE(i.cls) == 0)
++					emitf("xor%k %=, %=", &i, fn, f);
++				else
++					emitf("pxor %D=, %D=", &i, fn, f);
++				break;
++			}
++			i.cls = KWIDE(i.cls) ? Kl : Kw;
++		}
+ 		if (i.cls == Kl
+ 		&& t0 == RCon
+ 		&& fn->con[i.arg[0].val].type == CBits) {
+diff --git a/amd64/isel.c b/amd64/isel.c
+index 607c176..1c902f5 100644
+--- a/amd64/isel.c
++++ b/amd64/isel.c
+@@ -69,7 +69,7 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
+ 	r1 = r0 = *r;
+ 	s = rslot(r0, fn);
+ 	op = i ? i->op : Ocopy;
+-	if (KBASE(k) == 1 && rtype(r0) == RCon) {
++	if (KBASE(k) == 1 && rtype(r0) == RCon && fn->con[r0.val].bits.i != 0) {
+ 		/* load floating points from memory
+ 		 * slots, they can't be used as
+ 		 * immediates
+@@ -84,13 +84,15 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
+ 		a.offset.label = intern(buf);
+ 		fn->mem[fn->nmem-1] = a;
+ 	}
+-	else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
++	else if (op != Ocopy && ((k == Kl && noimm(r0, fn)) || (KBASE(k) == 1 && rtype(r0) == RCon))) {
+ 		/* load constants that do not fit in
+ 		 * a 32bit signed integer into a
+-		 * long temporary
++		 * long temporary OR
++		 * load positive zero into a floating
++		 * point register
+ 		 */
+-		r1 = newtmp("isel", Kl, fn);
+-		emit(Ocopy, Kl, r1, r0, R);
++		r1 = newtmp("isel", k, fn);
++		emit(Ocopy, k, r1, r0, R);
+ 	}
+ 	else if (s != -1) {
+ 		/* load fast locals' addresses into
+-- 
+2.32.0
+
diff --git a/pkg/qbe/patch/0005-fold-Don-t-fold-invalid-addition-subtraction-rather-.patch b/pkg/qbe/patch/0005-fold-Don-t-fold-invalid-addition-subtraction-rather-.patch
@@ -1,66 +0,0 @@
-From 264b07e0cb0ce869cfcdab0a3e66c92a99de5dee Mon Sep 17 00:00:00 2001
-From: Michael Forney <mforney@mforney.org>
-Date: Sun, 16 Jun 2019 01:38:27 -0700
-Subject: [PATCH] fold: Don't fold invalid addition/subtraction rather than
- failing
-
-This may happen in a branch QBE doesn't realize is unreachable,
-for example (simplified from real code found in ncurses)
-
-	data $str = { b "abcdef", b 0 }
-	function l $f(w %x) {
-	@start
-		%.1 =w ceqw %x, 0
-		jnz %.1, @logic_join, @logic_right
-	@logic_right
-		%p =l call $strchr(l $str, w %x)
-		%.2 =w ceql %p, 0
-	@logic_join
-		%.3 =w phi @start %.1, @logic_right %.2
-		jnz %.3, @fail, @return
-	@fail
-		ret 0
-	@return
-		%.4 =l sub %p, $str
-		ret %.4
-	}
----
- fold.c | 11 ++++-------
- 1 file changed, 4 insertions(+), 7 deletions(-)
-
-diff --git a/fold.c b/fold.c
-index 0a3945f..9e1a12d 100644
---- a/fold.c
-+++ b/fold.c
-@@ -343,7 +343,7 @@ foldint(Con *res, int op, int w, Con *cl, Con *cr)
- 	if (op == Oadd) {
- 		if (cl->type == CAddr) {
- 			if (cr->type == CAddr)
--				err("undefined addition (addr + addr)");
-+				return 1;
- 			lab = cl->label;
- 			typ = CAddr;
- 		}
-@@ -358,16 +358,13 @@ foldint(Con *res, int op, int w, Con *cl, Con *cr)
- 				lab = cl->label;
- 				typ = CAddr;
- 			} else if (cl->label != cr->label)
--				err("undefined substraction (addr1 - addr2)");
-+				return 1;
- 		}
- 		else if (cr->type == CAddr)
--			err("undefined substraction (num - addr)");
--	}
--	else if (cl->type == CAddr || cr->type == CAddr) {
--		if (Ocmpl <= op && op <= Ocmpl1)
- 			return 1;
--		err("invalid address operand for '%s'", optab[op].name);
- 	}
-+	else if (cl->type == CAddr || cr->type == CAddr)
-+		return 1;
- 	switch (op) {
- 	case Oadd:  x = l.u + r.u; break;
- 	case Osub:  x = l.u - r.u; break;
--- 
-2.22.0
-
diff --git a/pkg/qbe/patch/0006-amd64-optimize-loading-0-into-floating-point-registe.patch b/pkg/qbe/patch/0006-amd64-optimize-loading-0-into-floating-point-registe.patch
@@ -1,76 +0,0 @@
-From 1e0c08a288a5f7993dd8565ace35f1ecfc614544 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?=C3=89rico=20Nogueira?= <erico.erc@gmail.com>
-Date: Sun, 11 Jul 2021 19:19:12 -0300
-Subject: [PATCH] amd64: optimize loading +0 into floating point registers
-
-Loading +0 into a floating point register can be done using pxor or
-xorps instructions. Per [1], we went with pxor because it can run on all
-vector ALU ports, even if it's one byte longer.
-
-To implement it, we special case fixarg to emit Ocopy when the value is
-+0, and emitins to treat Ocopy for floating point args specially. Since
-0. == -0., we can't check if bits.d or bits.f are equal to 0. To avoid
-requiring signbit(), we inspect bits.i directly; this assumes the bits
-union is always zero-initialized.
-
-[1] https://stackoverflow.com/questions/39811577/does-using-mix-of-pxor-and-xorps-affect-performance/39828976
----
- amd64/emit.c |  8 ++++++++
- amd64/isel.c | 12 +++++++-----
- 2 files changed, 15 insertions(+), 5 deletions(-)
-
-diff --git a/amd64/emit.c b/amd64/emit.c
-index 09b90d5..311b8c6 100644
---- a/amd64/emit.c
-+++ b/amd64/emit.c
-@@ -443,6 +443,14 @@ emitins(Ins i, Fn *fn, FILE *f)
- 		if (req(i.to, i.arg[0]))
- 			break;
- 		t0 = rtype(i.arg[0]);
-+		if (isreg(i.to)
-+		&& KBASE(i.cls) == 1
-+		&& t0 == RCon
-+		&& fn->con[i.arg[0].val].type == CBits) {
-+			assert(fn->con[i.arg[0].val].bits.i == 0);
-+			emitf("pxor %D=, %D=", &i, fn, f);
-+			break;
-+		}
- 		if (i.cls == Kl
- 		&& t0 == RCon
- 		&& fn->con[i.arg[0].val].type == CBits) {
-diff --git a/amd64/isel.c b/amd64/isel.c
-index 607c176..1c902f5 100644
---- a/amd64/isel.c
-+++ b/amd64/isel.c
-@@ -69,7 +69,7 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
- 	r1 = r0 = *r;
- 	s = rslot(r0, fn);
- 	op = i ? i->op : Ocopy;
--	if (KBASE(k) == 1 && rtype(r0) == RCon) {
-+	if (KBASE(k) == 1 && rtype(r0) == RCon && fn->con[r0.val].bits.i != 0) {
- 		/* load floating points from memory
- 		 * slots, they can't be used as
- 		 * immediates
-@@ -84,13 +84,15 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
- 		a.offset.label = intern(buf);
- 		fn->mem[fn->nmem-1] = a;
- 	}
--	else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
-+	else if (op != Ocopy && ((k == Kl && noimm(r0, fn)) || (KBASE(k) == 1 && rtype(r0) == RCon))) {
- 		/* load constants that do not fit in
- 		 * a 32bit signed integer into a
--		 * long temporary
-+		 * long temporary OR
-+		 * load positive zero into a floating
-+		 * point register
- 		 */
--		r1 = newtmp("isel", Kl, fn);
--		emit(Ocopy, Kl, r1, r0, R);
-+		r1 = newtmp("isel", k, fn);
-+		emit(Ocopy, k, r1, r0, R);
- 	}
- 	else if (s != -1) {
- 		/* load fast locals' addresses into
--- 
-2.32.0
-
diff --git a/pkg/qbe/patch/0007-amd64-optimize-zeroing-of-integer-registers-as-well.patch b/pkg/qbe/patch/0007-amd64-optimize-zeroing-of-integer-registers-as-well.patch
@@ -1,34 +0,0 @@
-From 1b61d04de8d62821eec915eec6bde6b9a0a2d1c9 Mon Sep 17 00:00:00 2001
-From: Michael Forney <mforney@mforney.org>
-Date: Mon, 30 Aug 2021 13:40:48 -0700
-Subject: [PATCH] amd64: optimize zeroing of integer registers as well
-
----
- amd64/emit.c | 10 ++++++----
- 1 file changed, 6 insertions(+), 4 deletions(-)
-
-diff --git a/amd64/emit.c b/amd64/emit.c
-index 311b8c6..015b921 100644
---- a/amd64/emit.c
-+++ b/amd64/emit.c
-@@ -444,11 +444,13 @@ emitins(Ins i, Fn *fn, FILE *f)
- 			break;
- 		t0 = rtype(i.arg[0]);
- 		if (isreg(i.to)
--		&& KBASE(i.cls) == 1
- 		&& t0 == RCon
--		&& fn->con[i.arg[0].val].type == CBits) {
--			assert(fn->con[i.arg[0].val].bits.i == 0);
--			emitf("pxor %D=, %D=", &i, fn, f);
-+		&& fn->con[i.arg[0].val].type == CBits
-+		&& fn->con[i.arg[0].val].bits.i == 0) {
-+			if (KBASE(i.cls) == 0)
-+				emitf("xor%k %=, %=", &i, fn, f);
-+			else
-+				emitf("pxor %D=, %D=", &i, fn, f);
- 			break;
- 		}
- 		if (i.cls == Kl
--- 
-2.32.0
-
diff --git a/pkg/qbe/ver b/pkg/qbe/ver
@@ -1 +1 @@
-6a69210b0f r0
+900805a8fe r0

D	pkg/qbe/patch/0001-arm64-Handle-slots.patch	36	------------------------------------
A	pkg/qbe/patch/0001-arm64-prevent-stack-clobber-when-passing-structures-.patch	33	+++++++++++++++++++++++++++++++++
D	pkg/qbe/patch/0002-arm64-Handle-slots-in-Ocopy-operands.patch	56	--------------------------------------------------------
A	pkg/qbe/patch/0002-increase-NString-to-72.patch	25	+++++++++++++++++++++++++
D	pkg/qbe/patch/0003-arm64-Prevent-stack-clobber-when-passing-structures-.patch	33	---------------------------------
A	pkg/qbe/patch/0003-fold-don-t-fold-invalid-addition-subtraction-rather-.patch	66	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	pkg/qbe/patch/0004-Increase-NString-to-96.patch	25	-------------------------
A	pkg/qbe/patch/0004-gas-put-zero-data-into-.bss.patch	102	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch	83	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	pkg/qbe/patch/0005-fold-Don-t-fold-invalid-addition-subtraction-rather-.patch	66	------------------------------------------------------------------
D	pkg/qbe/patch/0006-amd64-optimize-loading-0-into-floating-point-registe.patch	76	----------------------------------------------------------------------------
D	pkg/qbe/patch/0007-amd64-optimize-zeroing-of-integer-registers-as-well.patch	34	----------------------------------
M	pkg/qbe/ver	2	+-