From 431a1bf775d468bcd788c3dd716b97cc0fca1f34 Mon Sep 17 00:00:00 2001 From: oharboe Date: Mon, 18 Aug 2008 12:00:34 +0000 Subject: * duplicated crt0.s and some other stuff from libgloss into sw/startup. This makes it easier to tinker w/startup code. --- zpu/ChangeLog | 3 + zpu/docs/zpu_arch.html | 16 +- zpu/sw/startup/crt0.S | 957 ++++++++++++++++++++++++++++++++++++++++++++++++ zpu/sw/startup/crt_io.c | 91 +++++ zpu/sw/startup/time.c | 32 ++ 5 files changed, 1098 insertions(+), 1 deletion(-) create mode 100644 zpu/sw/startup/crt0.S create mode 100644 zpu/sw/startup/crt_io.c create mode 100644 zpu/sw/startup/time.c diff --git a/zpu/ChangeLog b/zpu/ChangeLog index 88bc650..02adb2c 100644 --- a/zpu/ChangeLog +++ b/zpu/ChangeLog @@ -1,3 +1,6 @@ +2008-08-18 Øyvind Harboe + * duplicated crt0.s and some other stuff from libgloss into + sw/startup. This makes it easier to tinker w/startup code. 2008-08-08 Salvador E. Tropea * zpu/hdl/zpu4/core/histogram.perl - generate opcode histogram from HDL simulation output diff --git a/zpu/docs/zpu_arch.html b/zpu/docs/zpu_arch.html index ccbd0df..d8d982d 100644 --- a/zpu/docs/zpu_arch.html +++ b/zpu/docs/zpu_arch.html @@ -5,6 +5,7 @@
  • Getting started
  • Introduction
  • Instruction set +
  • Custom startup code (aka crt0.s)
  • Implementing your own ZPU
  • Jump vectors
  • Memory map @@ -817,7 +818,20 @@ int address = pop();
    - + +

    Custom startup code (aka crt0.s)

    +To minimize the size of an application, one important trick is to +strip down the startup code. The startup code contains emulation +of instructions that may never be used by a particular application. +

    +The startup code is found in the GCC source code under gcc/libgloss/zpu, +but to make the startup code more available, it has been duplicated +into zpu/sw/startup. +

    +To minimize startup size, see codesize +demo. This is pretty standard GCC stuff and simple enough once you've +gone over it a couple of times. +

    Implementing your own ZPU

    One of the neat things about the ZPU is that the instruction set and architecture diff --git a/zpu/sw/startup/crt0.S b/zpu/sw/startup/crt0.S new file mode 100644 index 0000000..00870c4 --- /dev/null +++ b/zpu/sw/startup/crt0.S @@ -0,0 +1,957 @@ +/* Startup code for ZPU + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2, or (at your option) any +later version. + +In addition to the permissions in the GNU General Public License, the +Free Software Foundation gives you unlimited permission to link the +compiled version of this file with other programs, and to distribute +those programs without any restriction coming from the use of this +file. (The General Public License restrictions do apply in other +respects; for example, they cover modification of the file, and +distribution when not linked into another program.) + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; see the file COPYING. If not, write to +the Free Software Foundation, 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + .file "crt0.S" + + + + +; .section ".fixed_vectors","ax" +; KLUDGE!!! we remove the executable bit to avoid relaxation + .section ".fixed_vectors","a" + +; DANGER!!!! +; we need to align these code sections to 32 bytes, which +; means we must not use any assembler instructions that are relaxed +; at linker time +; DANGER!!!! 
+ + .macro fixedim value + im \value + .endm + + .macro jsr address + + im 0 ; save R0 + load + im 4 ; save R1 + load + im 8 ; save R2 + load + + fixedim \address + call + + im 8 + store ; restore R2 + im 4 + store ; restore R1 + im 0 + store ; restore R0 + .endm + + + .macro jmp address + fixedim \address + poppc + .endm + + + .macro fast_neg + not + im 1 + add + .endm + + .macro cimpl funcname + ; save R0 + im 0 + load + + ; save R1 + im 4 + load + + ; save R2 + im 8 + load + + loadsp 20 + loadsp 20 + + fixedim \funcname + call + + ; destroy arguments on stack + storesp 0 + storesp 0 + + im 0 + load + + ; poke the result into the right slot + storesp 24 + + ; restore R2 + im 8 + store + + ; restore R1 + im 4 + store + + ; restore r0 + im 0 + store + + + storesp 4 + poppc + .endm + + .macro mult1bit + ; create mask of lowest bit in A + loadsp 8 ; A + im 1 + and + im -1 + add + not + loadsp 8 ; B + and + add ; accumulate in C + + ; shift B left 1 bit + loadsp 4 ; B + addsp 0 + storesp 8 ; B + + ; shift A right 1 bit + loadsp 8 ; A + flip + addsp 0 + flip + storesp 12 ; A + .endm + + + +/* vectors */ + .balign 32,0 +# offset 0x0000 0000 + .globl _start +_start: + ; intSp must be 0 when we jump to _premain + + im ZPU_ID + loadsp 0 + im _cpu_config + store + config + jmp _premain + + + + .balign 32,0 +# offset 0x0000 0020 + .globl _zpu_interrupt_vector +_zpu_interrupt_vector: + jsr _zpu_interrupt + poppc + + +/* instruction emulation code */ + +# opcode 34 +# offset 0x0000 0040 + .balign 32,0 +_loadh: + loadsp 4 + ; by not masking out bit 0, we cause a memory access error + ; on unaligned access + im ~0x2 + and + load + + ; mult 8 + loadsp 8 + im 3 + and + fast_neg + im 2 + add + im 3 + ashiftleft + ; shift right addr&3 * 8 + lshiftright + im 0xffff + and + storesp 8 + + poppc + +# opcode 35 +# offset 0x0000 0060 + .balign 32,0 +_storeh: + loadsp 4 + ; by not masking out bit 0, we cause a memory access error + ; on unaligned access + im ~0x2 + and + load + + ; mask + 
im 0xffff + loadsp 12 + im 3 + and + fast_neg + im 2 + add + im 3 + ashiftleft + ashiftleft + not + + and + + loadsp 12 + im 0xffff + + nop + + fixedim _storehtail + poppc + + +# opcode 36 +# offset 0x0000 0080 + .balign 32,0 +_lessthan: + loadsp 8 + fast_neg + loadsp 8 + add + + ; DANGER!!!! + ; 0x80000000 will overflow when negated, so we need to mask + ; the result above with the compare positive to negative + ; number case + loadsp 12 + loadsp 12 + not + and + not + and + + + ; handle case where we are comparing a negative number + ; and positve number. This can underflow. E.g. consider 0x8000000 < 0x1000 + loadsp 12 + not + loadsp 12 + and + + or + + + + flip + im 1 + and + + + storesp 12 + storesp 4 + poppc + + +# opcode 37 +# offset 0x0000 00a0 + .balign 32,0 +_lessthanorequal: + loadsp 8 + loadsp 8 + lessthan + loadsp 12 + loadsp 12 + eq + or + + storesp 12 + storesp 4 + poppc + + +# opcode 38 +# offset 0x0000 00c0 + .balign 32,0 +_ulessthan: + ; fish up arguments + loadsp 4 + loadsp 12 + + /* low: -1 if low bit dif is negative 0 otherwise: neg (not x&1 and (y&1)) + x&1 y&1 neg (not x&1 and (y&1)) + 1 1 0 + 1 0 0 + 0 1 -1 + 0 0 0 + + */ + loadsp 4 + not + loadsp 4 + and + im 1 + and + neg + + + /* high: upper 31-bit diff is only wrong when diff is 0 and low=-1 + high=x>>1 - y>>1 + low + + extremes + + 0000 - 1111: + low= neg(not 0 and 1) = 1111 (-1) + high=000+ neg(111) +low = 000 + 1001 + low = 1000 + OK + + 1111 - 0000 + low=neg(not 1 and 0) = 0 + high=111+neg(000) + low = 0111 + OK + + + */ + loadsp 8 + + flip + addsp 0 + flip + + loadsp 8 + + flip + addsp 0 + flip + + sub + + ; if they are equal, then the last bit decides... 
+ add + + /* test if negative: result = flip(diff) & 1 */ + flip + im 1 + and + + ; destroy a&b which are on stack + storesp 4 + storesp 4 + + storesp 12 + storesp 4 + poppc + +# opcode 39 +# offset 0x0000 00e0 + .balign 32,0 +_ulessthanorequal: + loadsp 8 + loadsp 8 + ulessthan + loadsp 12 + loadsp 12 + eq + or + + storesp 12 + storesp 4 + poppc + + +# opcode 40 +# offset 0x0000 0100 + .balign 32,0 + .globl _swap +_swap: + breakpoint ; tbd + +# opcode 41 +# offset 0x0000 0120 + .balign 32,0 +_slowmult: + im _slowmultImpl + poppc + +# opcode 42 +# offset 0x0000 0140 + .balign 32,0 +_lshiftright: + loadsp 8 + flip + + loadsp 8 + ashiftleft + flip + + storesp 12 + storesp 4 + + poppc + + +# opcode 43 +# offset 0x0000 0160 + .balign 32,0 +_ashiftleft: + loadsp 8 + + loadsp 8 + im 0x1f + and + fast_neg + im _ashiftleftEnd + add + poppc + + + +# opcode 44 +# offset 0x0000 0180 + .balign 32,0 +_ashiftright: + loadsp 8 + loadsp 8 + lshiftright + + ; handle signed value + im -1 + loadsp 12 + im 0x1f + and + lshiftright + not ; now we have an integer on the stack with the signed + ; bits in the right position + + ; mask these bits with the signed bit. + loadsp 16 + not + flip + im 1 + and + im -1 + add + + and + + ; stuff in the signed bits... 
+ or + + ; store result into correct stack slot + storesp 12 + + ; move up return value + storesp 4 + poppc + +# opcode 45 +# offset 0x0000 01a0 + .balign 32,0 +_call: + ; fn + loadsp 4 + + ; return address + loadsp 4 + + ; store return address + storesp 12 + + ; fn to call + storesp 4 + + pushsp ; flush internal stack + popsp + + poppc + +_storehtail: + + and + loadsp 12 + im 3 + and + fast_neg + im 2 + add + im 3 + ashiftleft + nop + ashiftleft + + or + + loadsp 8 + im ~0x3 + and + + store + + storesp 4 + storesp 4 + poppc + + +# opcode 46 +# offset 0x0000 01c0 + .balign 32,0 +_eq: + loadsp 8 + fast_neg + loadsp 8 + add + + not + loadsp 0 + im 1 + add + not + and + flip + im 1 + and + + storesp 12 + storesp 4 + poppc + +# opcode 47 +# offset 0x0000 01e0 + .balign 32,0 +_neq: + loadsp 8 + fast_neg + loadsp 8 + add + + not + loadsp 0 + im 1 + add + not + and + flip + + not + + im 1 + and + + storesp 12 + storesp 4 + poppc + + +# opcode 48 +# offset 0x0000 0200 + .balign 32,0 +_neg: + loadsp 4 + not + im 1 + add + storesp 8 + + poppc + + +# opcode 49 +# offset 0x0000 0220 + .balign 32,0 +_sub: + loadsp 8 + loadsp 8 + fast_neg + add + storesp 12 + + storesp 4 + + poppc + + +# opcode 50 +# offset 0x0000 0240 + .balign 32,0 +_xor: + loadsp 8 + not + loadsp 8 + and + + loadsp 12 + loadsp 12 + not + and + + or + + storesp 12 + storesp 4 + poppc + +# opcode 51 +# offset 0x0000 0260 + .balign 32,0 +_loadb: + loadsp 4 + im ~0x3 + and + load + + loadsp 8 + im 3 + and + fast_neg + im 3 + add + ; x8 + addsp 0 + addsp 0 + addsp 0 + + lshiftright + + im 0xff + and + storesp 8 + + poppc + + +# opcode 52 +# offset 0x0000 0280 + .balign 32,0 +_storeb: + loadsp 4 + im ~0x3 + and + load + + ; mask away destination + im _mask + loadsp 12 + im 3 + and + addsp 0 + addsp 0 + add + load + + and + + + im _storebtail + poppc + +# opcode 53 +# offset 0x0000 02a0 + .balign 32,0 +_div: + cimpl __divsi3 + +# opcode 54 +# offset 0x0000 02c0 + .balign 32,0 +_mod: + cimpl __modsi3 + +# opcode 55 
+# offset 0x0000 02e0 + .balign 32,0 + .globl _eqbranch +_eqbranch: + loadsp 8 + + ; eq + + not + loadsp 0 + im 1 + add + not + and + flip + im 1 + and + + ; mask + im -1 + add + loadsp 0 + storesp 16 + + ; no branch address + loadsp 4 + + and + + ; fetch boolean & neg mask + loadsp 12 + not + + ; calc address & mask for branch + loadsp 8 + loadsp 16 + add + ; subtract 1 to find PC of branch instruction + im -1 + add + + and + + or + + storesp 4 + storesp 4 + storesp 4 + poppc + + +# opcode 56 +# offset 0x0000 0300 + .balign 32,0 + .globl _neqbranch +_neqbranch: + loadsp 8 + + ; neq + + not + loadsp 0 + im 1 + add + not + and + flip + + not + + im 1 + and + + ; mask + im -1 + add + loadsp 0 + storesp 16 + + ; no branch address + loadsp 4 + + and + + ; fetch boolean & neg mask + loadsp 12 + not + + ; calc address & mask for branch + loadsp 8 + loadsp 16 + add + ; find address of branch instruction + im -1 + add + + and + + or + + storesp 4 + storesp 4 + storesp 4 + poppc + +# opcode 57 +# offset 0x0000 0320 + .balign 32,0 + .globl _poppcrel +_poppcrel: + add + ; address of poppcrel + im -1 + add + poppc + +# opcode 58 +# offset 0x0000 0340 + .balign 32,0 + .globl _config +_config: + im 1 + nop + im _hardware + store + storesp 4 + poppc + +# opcode 59 +# offset 0x0000 0360 + .balign 32,0 +_pushpc: + loadsp 4 + im 1 + add + storesp 8 + poppc + +# opcode 60 +# offset 0x0000 0380 + .balign 32,0 +_syscall_emulate: + .byte 0 + +# opcode 61 +# offset 0x0000 03a0 + .balign 32,0 +_pushspadd: + pushsp + im 4 + add + loadsp 8 + addsp 0 + addsp 0 + add + storesp 8 + + poppc + +# opcode 62 +# offset 0x0000 03c0 + .balign 32,0 +_halfmult: + breakpoint + +# opcode 63 +# offset 0x0000 03e0 + .balign 32,0 +_callpcrel: + loadsp 4 + loadsp 4 + add + im -1 + add + loadsp 4 + + storesp 12 ; return address + storesp 4 + pushsp ; this will flush the internal stack. 
+ popsp + poppc + + .text + + + + +_ashiftleftBegin: + .rept 0x1f + addsp 0 + .endr +_ashiftleftEnd: + storesp 12 + storesp 4 + poppc + +_storebtail: + loadsp 12 + im 0xff + and + loadsp 12 + im 3 + and + + fast_neg + im 3 + add + ; x8 + addsp 0 + addsp 0 + addsp 0 + + ashiftleft + + or + + loadsp 8 + im ~0x3 + and + + store + + storesp 4 + storesp 4 + poppc + + + + +; NB! this is not an EMULATE instruction. It is a varargs fn. + .globl _syscall +_syscall: + syscall + poppc + +_slowmultImpl: + + loadsp 8 ; A + loadsp 8 ; B + im 0 ; C + +.LmoreMult: + mult1bit + + ; cutoff + loadsp 8 + .byte (.LmoreMult-.Lbranch)&0x7f+0x80 +.Lbranch: + neqbranch + + storesp 4 + storesp 4 + storesp 12 + storesp 4 + poppc + + .data + .balign 4,0 +_mask: + .long 0x00ffffff + .long 0xff00ffff + .long 0xffff00ff + .long 0xffffff00 + + + .globl _hardware +_hardware: + .long 0 + .globl _cpu_config +_cpu_config: + .long 0 + diff --git a/zpu/sw/startup/crt_io.c b/zpu/sw/startup/crt_io.c new file mode 100644 index 0000000..966ae33 --- /dev/null +++ b/zpu/sw/startup/crt_io.c @@ -0,0 +1,91 @@ +#include +#include +#include +#include + +extern int _hardware; +/* _cpu_config==0 => Abel + * _cpu_config==1 => Zeta + * _cpu_config==2 => Phi + */ +extern int _cpu_config; +static volatile int *UART; +static volatile int *TIMER; +volatile int *MHZ; + + + +/* + * Wait indefinitely for input byte + */ + + +int __attribute__ ((weak)) inbyte() +{ + int val; + for (;;) + { + val=UART[1]; + if ((val&0x100)!=0) + { + return val&0xff; + } + } +} + + + +/* + * Output one character to the serial port + * + * + */ +void __attribute__ ((weak)) outbyte(int c) +{ + /* Wait for space in FIFO */ + while ((UART[0]&0x100)==0); + UART[0]=c; +} + +static const int mhz=64; + +void __attribute__ ((weak)) _initIO(void) +{ + if (_hardware) + { + if (_cpu_config==2) + { + /* Phi board addresses */ + UART=(volatile int *)0x080a000c; + TIMER=(volatile int *)0x080a0014; + MHZ=(volatile int *)&mhz; + } else + { + /* Abel board */ + 
UART=(volatile int *)0xc000; + TIMER=(volatile int *)0x9000; + MHZ=(volatile int *)0x8800; + } + } else + { + UART=(volatile int *)0x80000024; + TIMER=(volatile int *)0x80000100; + MHZ=(volatile int *)0x80000200; + } +} + + + +long long __attribute__ ((weak)) _readCycles() +{ + long long clock; + unsigned int i; + + TIMER[0]=0x2; /* sample timer */ + clock=0; + for (i=0; i<2; i++) + { + clock|=((long long )(TIMER[i]))<<(i*32); + } + return clock; +} diff --git a/zpu/sw/startup/time.c b/zpu/sw/startup/time.c new file mode 100644 index 0000000..767b62f --- /dev/null +++ b/zpu/sw/startup/time.c @@ -0,0 +1,32 @@ +#include <_ansi.h> +#include +#include + +extern long long _readCycles(); + + +extern volatile int *MHZ; + +long long _readMicroseconds() +{ + int Hz; + long long clock; + Hz=(*MHZ&0xff); + clock=_readCycles(); + return clock/(long long)Hz; +} + + + + +time_t +time (time_t *tloc) +{ + time_t t; + t=(time_t)(_readMicroseconds()/(long long )1000000); + if (tloc!=NULL) + { + *tloc=t; + } + return t; +} -- cgit v1.1