/* | |

* Copyright (C) 2003-2013 Altera Corporation | |

* All rights reserved. | |

* | |

* This program is free software; you can redistribute it and/or modify | |

* it under the terms of the GNU General Public License as published by | |

* the Free Software Foundation; either version 2 of the License, or | |

* (at your option) any later version. | |

* | |

* This program is distributed in the hope that it will be useful, | |

* but WITHOUT ANY WARRANTY; without even the implied warranty of | |

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |

* GNU General Public License for more details. | |

* | |

* You should have received a copy of the GNU General Public License | |

* along with this program. If not, see <http://www.gnu.org/licenses/>. | |

*/ | |

#include <linux/linkage.h> | |

#include <asm/entry.h> | |

.set noat | |

.set nobreak | |

/* | |

* Explicitly allow the use of r1 (the assembler temporary register) | |

* within this code. This register is normally reserved for the use of | |

* the compiler. | |

*/ | |

ENTRY(instruction_trap) | |

ldw r1, PT_R1(sp) // Restore registers | |

ldw r2, PT_R2(sp) | |

ldw r3, PT_R3(sp) | |

ldw r4, PT_R4(sp) | |

ldw r5, PT_R5(sp) | |

ldw r6, PT_R6(sp) | |

ldw r7, PT_R7(sp) | |

ldw r8, PT_R8(sp) | |

ldw r9, PT_R9(sp) | |

ldw r10, PT_R10(sp) | |

ldw r11, PT_R11(sp) | |

ldw r12, PT_R12(sp) | |

ldw r13, PT_R13(sp) | |

ldw r14, PT_R14(sp) | |

ldw r15, PT_R15(sp) | |

ldw ra, PT_RA(sp) | |

ldw fp, PT_FP(sp) | |

ldw gp, PT_GP(sp) | |

ldw et, PT_ESTATUS(sp) | |

wrctl estatus, et | |

ldw ea, PT_EA(sp) | |

ldw et, PT_SP(sp) /* backup sp in et */ | |

addi sp, sp, PT_REGS_SIZE | |

/* INSTRUCTION EMULATION | |

* --------------------- | |

* | |

* Nios II processors generate exceptions for unimplemented instructions. | |

* The routines below emulate these instructions. Depending on the | |

* processor core, the only instructions that might need to be emulated | |

* are div, divu, mul, muli, mulxss, mulxsu, and mulxuu. | |

* | |

* The emulations match the instructions, except for the following | |

* limitations: | |

* | |

* 1) The emulation routines do not emulate the use of the exception | |

* temporary register (et) as a source operand because the exception | |

* handler already has modified it. | |

* | |

* 2) The routines do not emulate the use of the stack pointer (sp) or | |

* the exception return address register (ea) as a destination because | |

* modifying these registers crashes the exception handler or the | |

* interrupted routine. | |

* | |

* Detailed Design | |

* --------------- | |

* | |

* The emulation routines expect the contents of integer registers r0-r31 | |

* to be on the stack at addresses sp, 4(sp), 8(sp), ... 124(sp). The | |

* routines retrieve source operands from the stack and modify the | |

* destination register's value on the stack prior to the end of the | |

* exception handler. Then all registers except the destination register | |

* are restored to their previous values. | |

* | |

* The instruction that causes the exception is found at address -4(ea). | |

* The instruction's OP and OPX fields identify the operation to be | |

* performed. | |

* | |

* One instruction, muli, is an I-type instruction that is identified by | |

* an OP field of 0x24. | |

* | |

* muli AAAAA,BBBBB,IIIIIIIIIIIIIIII,-0x24- | |

* 27 22 6 0 <-- LSB of field | |

* | |

* The remaining emulated instructions are R-type and have an OP field | |

* of 0x3a. Their OPX fields identify them. | |

* | |

* R-type AAAAA,BBBBB,CCCCC,XXXXXX,NNNNN,-0x3a- | |

* 27 22 17 11 6 0 <-- LSB of field | |

* | |

* | |

* Opcode Encoding. muli is identified by its OP value. Then OPX & 0x02 | |

* is used to differentiate between the division opcodes and the | |

* remaining multiplication opcodes. | |

* | |

* Instruction OP OPX OPX & 0x02 | |

* ----------- ---- ---- ---------- | |

* muli 0x24 | |

* divu 0x3a 0x24 0 | |

* div 0x3a 0x25 0 | |

* mul 0x3a 0x27 != 0 | |

* mulxuu 0x3a 0x07 != 0 | |

* mulxsu 0x3a 0x17 != 0 | |

* mulxss 0x3a 0x1f != 0 | |

*/ | |

/* | |

* Save everything on the stack to make it easy for the emulation | |

* routines to retrieve the source register operands. | |

*/ | |

addi sp, sp, -128 | |

stw zero, 0(sp) /* Save zero on stack to avoid special case for r0. */ | |

stw r1, 4(sp) | |

stw r2, 8(sp) | |

stw r3, 12(sp) | |

stw r4, 16(sp) | |

stw r5, 20(sp) | |

stw r6, 24(sp) | |

stw r7, 28(sp) | |

stw r8, 32(sp) | |

stw r9, 36(sp) | |

stw r10, 40(sp) | |

stw r11, 44(sp) | |

stw r12, 48(sp) | |

stw r13, 52(sp) | |

stw r14, 56(sp) | |

stw r15, 60(sp) | |

stw r16, 64(sp) | |

stw r17, 68(sp) | |

stw r18, 72(sp) | |

stw r19, 76(sp) | |

stw r20, 80(sp) | |

stw r21, 84(sp) | |

stw r22, 88(sp) | |

stw r23, 92(sp) | |

/* Don't bother to save et. It's already been changed. */ | |

rdctl r5, estatus | |

stw r5, 100(sp) | |

stw gp, 104(sp) | |

stw et, 108(sp) /* et contains previous sp value. */ | |

stw fp, 112(sp) | |

stw ea, 116(sp) | |

stw ra, 120(sp) | |

/* | |

* Split the instruction into its fields. We need 4*A, 4*B, and 4*C as | |

* offsets to the stack pointer for access to the stored register values. | |

*/ | |

ldw r2,-4(ea) /* r2 = AAAAA,BBBBB,IIIIIIIIIIIIIIII,PPPPPP */ | |

roli r3, r2, 7 /* r3 = BBB,IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BB */ | |

roli r4, r3, 3 /* r4 = IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB */ | |

roli r5, r4, 2 /* r5 = IIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB,II */ | |

srai r4, r4, 16 /* r4 = (sign-extended) IMM16 */ | |

roli r6, r5, 5 /* r6 = XXXX,NNNNN,PPPPPP,AAAAA,BBBBB,CCCCC,XX */ | |

andi r2, r2, 0x3f /* r2 = 00000000000000000000000000,PPPPPP */ | |

andi r3, r3, 0x7c /* r3 = 0000000000000000000000000,AAAAA,00 */ | |

andi r5, r5, 0x7c /* r5 = 0000000000000000000000000,BBBBB,00 */ | |

andi r6, r6, 0x7c /* r6 = 0000000000000000000000000,CCCCC,00 */ | |

/* Now | |

* r2 = OP | |

* r3 = 4*A | |

* r4 = IMM16 (sign extended) | |

* r5 = 4*B | |

* r6 = 4*C | |

*/ | |

/* | |

* Get the operands. | |

* | |

* It is necessary to check for muli because it uses an I-type | |

* instruction format, while the other instructions are have an R-type | |

* format. | |

* | |

* Prepare for either multiplication or division loop. | |

* They both loop 32 times. | |

*/ | |

movi r14, 32 | |

add r3, r3, sp /* r3 = address of A-operand. */ | |

ldw r3, 0(r3) /* r3 = A-operand. */ | |

movi r7, 0x24 /* muli opcode (I-type instruction format) */ | |

beq r2, r7, mul_immed /* muli doesn't use the B register as a source */ | |

add r5, r5, sp /* r5 = address of B-operand. */ | |

ldw r5, 0(r5) /* r5 = B-operand. */ | |

/* r4 = SSSSSSSSSSSSSSSS,-----IMM16------ */ | |

/* IMM16 not needed, align OPX portion */ | |

/* r4 = SSSSSSSSSSSSSSSS,CCCCC,-OPX--,00000 */ | |

srli r4, r4, 5 /* r4 = 00000,SSSSSSSSSSSSSSSS,CCCCC,-OPX-- */ | |

andi r4, r4, 0x3f /* r4 = 00000000000000000000000000,-OPX-- */ | |

/* Now | |

* r2 = OP | |

* r3 = src1 | |

* r5 = src2 | |

* r4 = OPX (no longer can be muli) | |

* r6 = 4*C | |

*/ | |

/* | |

* Multiply or Divide? | |

*/ | |

andi r7, r4, 0x02 /* For R-type multiply instructions, | |

OPX & 0x02 != 0 */ | |

bne r7, zero, multiply | |

/* DIVISION | |

* | |

* Divide an unsigned dividend by an unsigned divisor using | |

* a shift-and-subtract algorithm. The example below shows | |

* 43 div 7 = 6 for 8-bit integers. This classic algorithm uses a | |

* single register to store both the dividend and the quotient, | |

* allowing both values to be shifted with a single instruction. | |

* | |

* remainder dividend:quotient | |

* --------- ----------------- | |

* initialize 00000000 00101011: | |

* shift 00000000 0101011:_ | |

* remainder >= divisor? no 00000000 0101011:0 | |

* shift 00000000 101011:0_ | |

* remainder >= divisor? no 00000000 101011:00 | |

* shift 00000001 01011:00_ | |

* remainder >= divisor? no 00000001 01011:000 | |

* shift 00000010 1011:000_ | |

* remainder >= divisor? no 00000010 1011:0000 | |

* shift 00000101 011:0000_ | |

* remainder >= divisor? no 00000101 011:00000 | |

* shift 00001010 11:00000_ | |

* remainder >= divisor? yes 00001010 11:000001 | |

* remainder -= divisor - 00000111 | |

* ---------- | |

* 00000011 11:000001 | |

* shift 00000111 1:000001_ | |

* remainder >= divisor? yes 00000111 1:0000011 | |

* remainder -= divisor - 00000111 | |

* ---------- | |

* 00000000 1:0000011 | |

* shift 00000001 :0000011_ | |

* remainder >= divisor? no 00000001 :00000110 | |

* | |

* The quotient is 00000110. | |

*/ | |

divide: | |

/* | |

* Prepare for division by assuming the result | |

* is unsigned, and storing its "sign" as 0. | |

*/ | |

movi r17, 0 | |

/* Which division opcode? */ | |

xori r7, r4, 0x25 /* OPX of div */ | |

bne r7, zero, unsigned_division | |

/* | |

* OPX is div. Determine and store the sign of the quotient. | |

* Then take the absolute value of both operands. | |

*/ | |

xor r17, r3, r5 /* MSB contains sign of quotient */ | |

bge r3,zero,dividend_is_nonnegative | |

sub r3, zero, r3 /* -r3 */ | |

dividend_is_nonnegative: | |

bge r5, zero, divisor_is_nonnegative | |

sub r5, zero, r5 /* -r5 */ | |

divisor_is_nonnegative: | |

unsigned_division: | |

/* Initialize the unsigned-division loop. */ | |

movi r13, 0 /* remainder = 0 */ | |

/* Now | |

* r3 = dividend : quotient | |

* r4 = 0x25 for div, 0x24 for divu | |

* r5 = divisor | |

* r13 = remainder | |

* r14 = loop counter (already initialized to 32) | |

* r17 = MSB contains sign of quotient | |

*/ | |

/* | |

* for (count = 32; count > 0; --count) | |

* { | |

*/ | |

divide_loop: | |

/* | |

* Division: | |

* | |

* (remainder:dividend:quotient) <<= 1; | |

*/ | |

slli r13, r13, 1 | |

cmplt r7, r3, zero /* r7 = MSB of r3 */ | |

or r13, r13, r7 | |

slli r3, r3, 1 | |

/* | |

* if (remainder >= divisor) | |

* { | |

* set LSB of quotient | |

* remainder -= divisor; | |

* } | |

*/ | |

bltu r13, r5, div_skip | |

ori r3, r3, 1 | |

sub r13, r13, r5 | |

div_skip: | |

/* | |

* } | |

*/ | |

subi r14, r14, 1 | |

bne r14, zero, divide_loop | |

/* Now | |

* r3 = quotient | |

* r4 = 0x25 for div, 0x24 for divu | |

* r6 = 4*C | |

* r17 = MSB contains sign of quotient | |

*/ | |

/* | |

* Conditionally negate signed quotient. If quotient is unsigned, | |

* the sign already is initialized to 0. | |

*/ | |

bge r17, zero, quotient_is_nonnegative | |

sub r3, zero, r3 /* -r3 */ | |

quotient_is_nonnegative: | |

/* | |

* Final quotient is in r3. | |

*/ | |

add r6, r6, sp | |

stw r3, 0(r6) /* write quotient to stack */ | |

br restore_registers | |

/* MULTIPLICATION | |

* | |

* A "product" is the number that one gets by summing a "multiplicand" | |

* several times. The "multiplier" specifies the number of copies of the | |

* multiplicand that are summed. | |

* | |

* Actual multiplication algorithms don't use repeated addition, however. | |

* Shift-and-add algorithms get the same answer as repeated addition, and | |

* they are faster. To compute the lower half of a product (pppp below) | |

* one shifts the product left before adding in each of the partial | |

* products (a * mmmm) through (d * mmmm). | |

* | |

* To compute the upper half of a product (PPPP below), one adds in the | |

* partial products (d * mmmm) through (a * mmmm), each time following | |

* the add by a right shift of the product. | |

* | |

* mmmm | |

* * abcd | |

* ------ | |

* #### = d * mmmm | |

* #### = c * mmmm | |

* #### = b * mmmm | |

* #### = a * mmmm | |

* -------- | |

* PPPPpppp | |

* | |

* The example above shows 4 partial products. Computing actual Nios II | |

* products requires 32 partials. | |

* | |

* It is possible to compute the result of mulxsu from the result of | |

* mulxuu because the only difference between the results of these two | |

* opcodes is the value of the partial product associated with the sign | |

* bit of rA. | |

* | |

* mulxsu = mulxuu - (rA < 0) ? rB : 0; | |

* | |

* It is possible to compute the result of mulxss from the result of | |

* mulxsu because the only difference between the results of these two | |

* opcodes is the value of the partial product associated with the sign | |

* bit of rB. | |

* | |

* mulxss = mulxsu - (rB < 0) ? rA : 0; | |

* | |

*/ | |

mul_immed: | |

/* Opcode is muli. Change it into mul for remainder of algorithm. */ | |

mov r6, r5 /* Field B is dest register, not field C. */ | |

mov r5, r4 /* Field IMM16 is src2, not field B. */ | |

movi r4, 0x27 /* OPX of mul is 0x27 */ | |

multiply: | |

/* Initialize the multiplication loop. */ | |

movi r9, 0 /* mul_product = 0 */ | |

movi r10, 0 /* mulxuu_product = 0 */ | |

mov r11, r5 /* save original multiplier for mulxsu and mulxss */ | |

mov r12, r5 /* mulxuu_multiplier (will be shifted) */ | |

movi r16, 1 /* used to create "rori B,A,1" from "ror B,A,r16" */ | |

/* Now | |

* r3 = multiplicand | |

* r5 = mul_multiplier | |

* r6 = 4 * dest_register (used later as offset to sp) | |

* r7 = temp | |

* r9 = mul_product | |

* r10 = mulxuu_product | |

* r11 = original multiplier | |

* r12 = mulxuu_multiplier | |

* r14 = loop counter (already initialized) | |

* r16 = 1 | |

*/ | |

/* | |

* for (count = 32; count > 0; --count) | |

* { | |

*/ | |

multiply_loop: | |

/* | |

* mul_product <<= 1; | |

* lsb = multiplier & 1; | |

*/ | |

slli r9, r9, 1 | |

andi r7, r12, 1 | |

/* | |

* if (lsb == 1) | |

* { | |

* mulxuu_product += multiplicand; | |

* } | |

*/ | |

beq r7, zero, mulx_skip | |

add r10, r10, r3 | |

cmpltu r7, r10, r3 /* Save the carry from the MSB of mulxuu_product. */ | |

ror r7, r7, r16 /* r7 = 0x80000000 on carry, or else 0x00000000 */ | |

mulx_skip: | |

/* | |

* if (MSB of mul_multiplier == 1) | |

* { | |

* mul_product += multiplicand; | |

* } | |

*/ | |

bge r5, zero, mul_skip | |

add r9, r9, r3 | |

mul_skip: | |

/* | |

* mulxuu_product >>= 1; logical shift | |

* mul_multiplier <<= 1; done with MSB | |

* mulx_multiplier >>= 1; done with LSB | |

*/ | |

srli r10, r10, 1 | |

or r10, r10, r7 /* OR in the saved carry bit. */ | |

slli r5, r5, 1 | |

srli r12, r12, 1 | |

/* | |

* } | |

*/ | |

subi r14, r14, 1 | |

bne r14, zero, multiply_loop | |

/* | |

* Multiply emulation loop done. | |

*/ | |

/* Now | |

* r3 = multiplicand | |

* r4 = OPX | |

* r6 = 4 * dest_register (used later as offset to sp) | |

* r7 = temp | |

* r9 = mul_product | |

* r10 = mulxuu_product | |

* r11 = original multiplier | |

*/ | |

/* Calculate address for result from 4 * dest_register */ | |

add r6, r6, sp | |

/* | |

* Select/compute the result based on OPX. | |

*/ | |

/* OPX == mul? Then store. */ | |

xori r7, r4, 0x27 | |

beq r7, zero, store_product | |

/* It's one of the mulx.. opcodes. Move over the result. */ | |

mov r9, r10 | |

/* OPX == mulxuu? Then store. */ | |

xori r7, r4, 0x07 | |

beq r7, zero, store_product | |

/* Compute mulxsu | |

* | |

* mulxsu = mulxuu - (rA < 0) ? rB : 0; | |

*/ | |

bge r3, zero, mulxsu_skip | |

sub r9, r9, r11 | |

mulxsu_skip: | |

/* OPX == mulxsu? Then store. */ | |

xori r7, r4, 0x17 | |

beq r7, zero, store_product | |

/* Compute mulxss | |

* | |

* mulxss = mulxsu - (rB < 0) ? rA : 0; | |

*/ | |

bge r11,zero,mulxss_skip | |

sub r9, r9, r3 | |

mulxss_skip: | |

/* At this point, assume that OPX is mulxss, so store*/ | |

store_product: | |

stw r9, 0(r6) | |

restore_registers: | |

/* No need to restore r0. */ | |

ldw r5, 100(sp) | |

wrctl estatus, r5 | |

ldw r1, 4(sp) | |

ldw r2, 8(sp) | |

ldw r3, 12(sp) | |

ldw r4, 16(sp) | |

ldw r5, 20(sp) | |

ldw r6, 24(sp) | |

ldw r7, 28(sp) | |

ldw r8, 32(sp) | |

ldw r9, 36(sp) | |

ldw r10, 40(sp) | |

ldw r11, 44(sp) | |

ldw r12, 48(sp) | |

ldw r13, 52(sp) | |

ldw r14, 56(sp) | |

ldw r15, 60(sp) | |

ldw r16, 64(sp) | |

ldw r17, 68(sp) | |

ldw r18, 72(sp) | |

ldw r19, 76(sp) | |

ldw r20, 80(sp) | |

ldw r21, 84(sp) | |

ldw r22, 88(sp) | |

ldw r23, 92(sp) | |

/* Does not need to restore et */ | |

ldw gp, 104(sp) | |

ldw fp, 112(sp) | |

ldw ea, 116(sp) | |

ldw ra, 120(sp) | |

ldw sp, 108(sp) /* last restore sp */ | |

eret | |

.set at | |

.set break |