From 74379e0ef3bc4433328a78d88b005cb097f87a3f Mon Sep 17 00:00:00 2001 From: Lasse Agentoft Eggen Date: Fri, 20 Sep 2019 12:50:33 +0200 Subject: [PATCH] Fix build issues for external users * .gitmodules compatibility * Add a few dependencies to `apt-get install` command * Fix some path issues System-specific paths to $SS* environment variables * Do not ignore intrinsic macros * Remove direct references to gams scheduler * General fixes to python scripts for some of the workloads * Do not treat warnings as errors (gem5 build) --- .gitmodules | 21 + Makefile | 6 +- README.md | 8 +- gem5/SConstruct | 12 +- gem5/src/base/bitunion.hh | 4 +- riscv-gnu-toolchain/.gitmodules | 21 - riscv-opcodes/.gitignore | 1 - riscv-opcodes/parse-opcodes | 2 +- riscv-opcodes/ss_insts.h | 429 ++++++++++++++++++ ss-scheduler/drivers/ss_sched.cpp | 3 +- ss-scheduler/src/scheduler/Makefile | 14 +- ss-workloads/dsp-benchmarks/centro-fir/gen.py | 4 +- .../dsp-benchmarks/common/Makefile.inc | 6 +- ss-workloads/dsp-benchmarks/common/output.py | 3 +- ss-workloads/dsp-benchmarks/qr/run.py | 2 +- ss-workloads/test-multi/run-tests.sh | 8 +- ss-workloads/test-single/run-tests.sh | 6 +- 17 files changed, 487 insertions(+), 63 deletions(-) create mode 100644 .gitmodules delete mode 100644 riscv-gnu-toolchain/.gitmodules create mode 100644 riscv-opcodes/ss_insts.h diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..867cb82b6 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,21 @@ +#[submodule "riscv-binutils"] +# path = riscv-gnu-toolchain/riscv-binutils +# url = git://github.com/riscv/riscv-binutils-gdb.git +[submodule "riscv-gcc"] + path = riscv-gnu-toolchain/riscv-gcc + url = git://github.com/riscv/riscv-gcc.git +[submodule "riscv-glibc"] + path = riscv-gnu-toolchain/riscv-glibc + url = git://github.com/riscv/riscv-glibc.git +[submodule "riscv-dejagnu"] + path = riscv-gnu-toolchain/riscv-dejagnu + url = git://github.com/riscv/riscv-dejagnu.git +[submodule "riscv-newlib"] + path = 
riscv-gnu-toolchain/riscv-newlib + url = git://github.com/riscv/riscv-newlib.git +[submodule "riscv-qemu"] + path = riscv-gnu-toolchain/riscv-qemu + url = git://github.com/riscv/riscv-qemu.git +[submodule "riscv-gdb"] + path = riscv-gnu-toolchain/riscv-gdb + url = git://github.com/riscv/riscv-binutils-gdb.git diff --git a/Makefile b/Makefile index c97a0aae6..158feaaba 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,8 @@ default: build-all include msg.mk +JOBS ?= 4 + RVTOOLS = riscv-gnu-toolchain riscv-opcodes SS_SCHED = ss-scheduler @@ -20,7 +22,7 @@ build-all: $(MODULES) clean-all: $(CLEAN_MODULES) $(GEM5): ss-scheduler - cd $@; scons build/RISCV/gem5.opt -j7 + cd $@; scons build/RISCV/gem5.opt -j$(JOBS) .PHONY: $(SS_SCHED) $(SS_SCHED): @@ -38,7 +40,7 @@ clean-ss: clean-$(SS_SCHED) riscv-gnu-toolchain: riscv-opcodes mkdir -p $@/build cd $@ && autoreconf -fiv && cd build && ../configure --prefix=$(SS_TOOLS)/ - $(MAKE) -C $@/build -j9 + $(MAKE) -C $@/build -j$(JOBS) riscv-opcodes: make -C $@ install-ss diff --git a/README.md b/README.md index dc2396507..f36a575c5 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ If you are on Ubuntu, try this: sudo apt-get install autoconf automake autotools-dev curl libmpc-dev libmpfr-dev \ libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc \ qt4-dev-tools libqt4-dev python-dev scons libboost-regex-dev \ - libboost-serialization-dev libgoogle-perftools-dev + libboost-serialization-dev libgoogle-perftools-dev libbison-dev parallel ```` ___ @@ -48,7 +48,11 @@ source setup.sh ```` make build-all ```` -NOTE: DO NOT use `-j`, which may cause dependence problem! +NOTE: DO NOT use `-j`, which may cause dependence problem! +You can set JOBS=n to choose how many parallel jobs the parallelizable build steps use. 
+```` +JOBS=4 make build-all +```` ___ diff --git a/gem5/SConstruct b/gem5/SConstruct index 5209aa6d4..53861d176 100755 --- a/gem5/SConstruct +++ b/gem5/SConstruct @@ -364,10 +364,10 @@ if main['GCC'] or main['CLANG']: # Treat warnings as errors but white list some warnings that we # want to allow (e.g., deprecation warnings). - main.Append(CCFLAGS=['-Werror', - '-Wno-error=deprecated-declarations', - '-Wno-error=deprecated', - ]) + #main.Append(CCFLAGS=['-Werror', + # '-Wno-error=deprecated-declarations', + # '-Wno-error=deprecated', + # ]) else: print(termcap.Yellow + termcap.Bold + 'Error' + termcap.Normal, end=' ') print("Don't know what compiler options to use for your compiler.") @@ -425,10 +425,8 @@ if main['GCC']: # to avoid performance penalties on certain AMD chips. Older # assemblers detect this as an error, "Error: expecting string # instruction after `rep'" - as_version_raw = readCommand([main['AS'], '-v', '/dev/null', - '-o', '/dev/null'], + as_version_raw = readCommand([main['AS'], '-v', '/dev/null'], exception=False).split() - # version strings may contain extra distro-specific # qualifiers, so play it safe and keep only what comes before # the first hyphen diff --git a/gem5/src/base/bitunion.hh b/gem5/src/base/bitunion.hh index 49a956eec..1a32991a8 100644 --- a/gem5/src/base/bitunion.hh +++ b/gem5/src/base/bitunion.hh @@ -435,7 +435,7 @@ namespace BitfieldBackend inline std::ostream & bitfieldBackendPrinter(std::ostream &os, const char &t) { - os << (const int)t; + os << (int)t; return os; } @@ -443,7 +443,7 @@ namespace BitfieldBackend inline std::ostream & bitfieldBackendPrinter(std::ostream &os, const unsigned char &t) { - os << (const unsigned int)t; + os << (unsigned int)t; return os; } } diff --git a/riscv-gnu-toolchain/.gitmodules b/riscv-gnu-toolchain/.gitmodules deleted file mode 100644 index 8172d68db..000000000 --- a/riscv-gnu-toolchain/.gitmodules +++ /dev/null @@ -1,21 +0,0 @@ -[submodule "riscv-binutils"] - path = riscv-binutils - url 
= ../riscv-binutils-gdb.git -[submodule "riscv-gcc"] - path = riscv-gcc - url = git://github.com/riscv/riscv-gcc.git -[submodule "riscv-glibc"] - path = riscv-glibc - url = git://github.com/riscv/riscv-glibc.git -[submodule "riscv-dejagnu"] - path = riscv-dejagnu - url = git://github.com/riscv/riscv-dejagnu.git -[submodule "riscv-newlib"] - path = riscv-newlib - url = git://github.com/riscv/riscv-newlib.git -[submodule "riscv-qemu"] - path = riscv-qemu - url = git://github.com/riscv/riscv-qemu.git -[submodule "riscv-gdb"] - path = riscv-gdb - url = git://github.com/riscv/riscv-binutils-gdb.git diff --git a/riscv-opcodes/.gitignore b/riscv-opcodes/.gitignore index db778f214..5b578f108 100644 --- a/riscv-opcodes/.gitignore +++ b/riscv-opcodes/.gitignore @@ -2,4 +2,3 @@ inst.chisel instr-table.tex priv-instr-table.tex -ss_*.h diff --git a/riscv-opcodes/parse-opcodes b/riscv-opcodes/parse-opcodes index b324c24c2..e063ce56f 100755 --- a/riscv-opcodes/parse-opcodes +++ b/riscv-opcodes/parse-opcodes @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python2 import math import sys diff --git a/riscv-opcodes/ss_insts.h b/riscv-opcodes/ss_insts.h new file mode 100644 index 000000000..fee3d4bf3 --- /dev/null +++ b/riscv-opcodes/ss_insts.h @@ -0,0 +1,429 @@ +#ifndef SS_INSTS_H +#define SS_INSTS_H + +// Magic sentinal for matching +#define SENTINAL (((uint64_t)1)<<63) +#define SENTINAL16 (((uint16_t)1)<<15) +#define SENTINAL32 (((uint32_t)1)<<31) + +#define REPEAT_FXPNT_BITS (3) +#define REPEAT_FXPNT_VAL (1< 0) { \ + SS_SCR_RD_OUTER(stride, n_strides, stretch); \ + SS_SCR_RD_INNER(scr_addr, acc_size, port); \ + } else { \ + int _addr = scr_addr + acc_size; \ + int _outer_cnt = n_strides; \ + SS_SCR_RD_OUTER(stride, n_strides, stretch); \ + SS_SCR_RD_INNER(_addr, -acc_size, port); \ + } \ + } while (false) + +#define SS_SCR_PORT_STREAM(scr_addr,stride,acc_size,n_strides, port) \ + SS_SCR_PORT_STREAM_STRETCH(scr_addr,stride,acc_size,0,n_strides, port) + +//A convienience command for 
linear access +#define SS_SCRATCH_READ(scr_addr, n_bytes, port) \ + SS_SCR_PORT_STREAM_STRETCH(scr_addr,0,n_bytes,0,1, port) + +//Read from DMA into a port +#define SS_DMA_READ_STRETCH(mem_addr, stride, acc_size, stretch, n_strides, port ) \ + do { \ + SS_DMA_RD_OUTER(stride, n_strides, stretch); \ + SS_DMA_RD_INNER(mem_addr, acc_size, port); \ + } while (false) + +#define SS_DMA_READ(mem_addr, stride, acc_size, n_strides, port ) \ + SS_DMA_READ_STRETCH(mem_addr, stride, acc_size, 0, n_strides, port ) + +#define SS_DMA_READ_SIMP(mem_addr, num_strides, port ) \ + __asm__ __volatile__("ss_dma_rd %0, %1, %2" : : "r"(mem_addr), "r"(num_strides), "i"(port)); + + +//Throw away some outputs. We will add a proper instruction for this at some point, rather than writing to memory +#define SS_GARBAGE_SIMP(output_port, num_elem) \ + __asm__ __volatile__("ss_wr_dma %0, %1, %2" : : "r"(0), "r"(num_elem), "i"(output_port|0x100)); + + +// Memory Oriented Instructions + +//Set this back to zero if you need different kinds of writes later in the same code!!! 
+#define SS_GARBAGE_BEFORE_STRIDE(num_garb) \ + __asm__ __volatile__("ss_garb %0, %1, 0" : : "r"(num_garb), "r"(num_garb)); \ + +// Plain Write to Memory +#define SS_DMA_WRITE(output_port, stride, acc_size, n_strides, mem_addr) \ + do { \ + SS_DMA_WR_OUTER(stride, n_strides, 0); \ + SS_DMA_WR_INNER(mem_addr, acc_size, output_port); \ + } while (false) + + +#define SS_DMA_WRITE_SIMP(output_port, num_strides, mem_addr) \ + __asm__ __volatile__("ss_wr_dma %0, %1, %2" : : "r"(mem_addr), "r"(num_strides), "i"(output_port)); + + +//Write to DMA, but throw away all but the last 16-bits from each word +//TODO: make these work with types defined for indirection +#define SS_DMA_WRITE_SHF16(output_port, stride, access_size, num_strides, mem_addr) \ + __asm__ __volatile__("ss_stride %0, %1, 0" : : "r"(stride), "r"(access_size)); \ + __asm__ __volatile__("ss_wr_dma %0, %1, %2" : : "r"(mem_addr), "r"(num_strides), "i"(output_port|0x40)); + +//Write to DMA, but throw away all but the last 32-bits from each word (implemented, not tested yet) +#define SS_DMA_WRITE_SHF32(output_port, stride, access_size, num_strides, mem_addr) \ + __asm__ __volatile__("ss_stride %0, %1, 0" : : "r"(stride), "r"(access_size)); \ + __asm__ __volatile__("ss_wr_dma %0, %1, %2" : : "r"(mem_addr), "r"(num_strides), "i"(output_port|0x80)); + + +// Scratch Oriented Instructions +// Plain Write to Scratch +#define SS_SCR_WRITE(output_port, num_bytes, scr_addr) \ + do { \ + SS_SCR_WR_INNER(scr_addr, num_bytes, output_port); \ + } while (false) + +// Do atomic stream update in scratchpad +#define SS_ATOMIC_SCR_OP(addr_port, val_port, offset, iters, opcode) \ + __asm__ __volatile__("ss_atom_op %0, %1, %2" : : "r"(offset), "r"(iters), "i"((addr_port<<7) | (val_port<<2) | opcode)); + + +// Send a constant value, repeated num_elements times to scratchpad +#define SS_CONST_SCR(scr_addr, val, num_elements) \ + __asm__ __volatile__("ss_set_iter %0 " : : "r"(num_elements)); \ + __asm__ __volatile__("ss_const_scr %0, 
%1, zero" : : "r"(scr_addr), "r"(val)); + +//Send a constant value, repeated num_elements times to a port +#define SS_CONST(port, val, num_elements) \ + __asm__ __volatile__("ss_const %0, %1, %2 " : : "r"(val), "r"(num_elements), "i"(port|(0<<8))); + +//Put a softbrain generated output value to a riscv core variable +#define SS_RECV(out_port, val) \ + __asm__ __volatile__("ss_recv %0, a0, %1 " : "=r"(val) : "i"(out_port)); + +//Send a constant value, repeated num_elements times to a port +// Plain Write to Scratch +#define SS_2D_CONST(port, val1, v1_repeat, val2, v2_repeat, iters) \ + __asm__ __volatile__("ss_set_iter %0 " : : "r"(iters)); \ + __asm__ __volatile__("ss_const %0, %1, %2 " : : "r"(val1), "r"(v1_repeat), "i"(port|(1<<7))); \ + __asm__ __volatile__("ss_const %0, %1, %2 " : : "r"(val2), "r"(v2_repeat), "i"(port|(1<<6))); + +//Send a constant value, repeated num_elements times to a port, with encoded +//const_width +#define SS_DCONST(port, val, num_elements, const_width) \ + __asm__ __volatile__("ss_const %0, %1, %2 " : : "r"(val), "r"(num_elements), "i"(port|const_width<<8)); + + +// This tells the port to repeat a certain number of times before consuming +// This is only really associated with the next command, as this information is forgotten as soon as +// a command is issued. 
+// Assuming stretch of size 10-bits (MSB represents if repeat_times is a port +// ot number +#define SS_CONFIG_PORT_EXPLICIT(repeat_times, stretch) \ + __asm__ __volatile__("ss_cfg_port %0, t0, %1" : : "r"(repeat_times), "i"((stretch) << 1)); + +#define SS_CONFIG_PORT(repeat_times, stretch) \ + do { \ + SS_CONFIG_PORT_EXPLICIT((repeat_times)*REPEAT_FXPNT_VAL, (stretch)*REPEAT_FXPNT_VAL) \ + } while(false) + +#define SS_REPEAT_PORT(times) \ + SS_CONFIG_PORT_EXPLICIT((times)*REPEAT_FXPNT_VAL, 0); + +// data-dependent repeat based on the data in a port +// only affine read dma->port, scr->port stream +// Assume the configuration same as the config of times port +#define SS_VREPEAT_PORT(times_port) \ + SS_CONFIG_PORT_EXPLICIT(times_port, 1); + +//Write from output to input port +#define SS_RECURRENCE(output_port, input_port, num_strides) \ + __asm__ __volatile__("ss_wr_rd %0, zero, %1" : : "r"(num_strides), "i"((input_port<<6) | (output_port))); + +//Write from output to input port +#define SS_RECURRENCE_PAD(output_port, input_port, num_strides) \ + __asm__ __volatile__("ss_wr_rd %0, %1, %2" : : "r"(num_strides), "r"(4), "i"((input_port<<6) | (output_port))); + +//Write from output to remote input port through the network (num_elem +//according to the port width) +#define SS_REM_PORT(output_port, num_elem, mask, remote_port) \ + __asm__ __volatile("ss_rem_port %0, %1, %2" : : "r"(num_elem), "r"(mask), "i"(((output_port<15?output_port:output_port-32)<<7) | (0<<6) | (remote_port<<1) | (0))); + // __asm__ __volatile("ss_rem_port %0, %1, %2" : : "r"(num_elem), "r"(mask), "i"((output_port<<7) | (0<<6) | (remote_port<<1) | (0))); + // __asm__ __volatile("ss_rem_port %0, %1, %2" : : "r"(num_elem), "r"(mask), "i"((output_port<<6) | (remote_port))); + +//Write from output to remote scratchpad through the network (1 flag stands for spad) +#define SS_IND_REM_SCRATCH(val_port, addr_port, num_elem, scr_base_addr, scratch_type) \ + __asm__ __volatile("ss_rem_port %0, %1, %2" : : 
"r"(num_elem), "r"(scr_base_addr), "i"((val_port<<7) | (scratch_type<<6) | (addr_port<<1) | (1))); + +#define SS_REM_SCRATCH(scr_base_addr, stride, access_size, num_strides, val_port, scratch_type) \ + __asm__ __volatile__("ss_stride %0, %1, 0" : : "r"(stride), "r"(access_size)); \ + __asm__ __volatile("ss_rem_port %0, %1, %2" : : "r"(num_strides), "r"(scr_base_addr), "i"((val_port<<7) | (scratch_type<<6) | (0<<1) | (1))); + // __asm__ __volatile("ss_rem_port %0, %1, %2" : : "r"(num_strides), "r"(scr_base_addr), "i"(((val_port<15?val_port:val_port-32)<<7) | (scratch_type<<6) | (0<<1) | (1))); + +// banked scratchpad: scr->port, port->remote scr +// TODO: remove scratch_type later (for now, I make the immediate negative) +#define SS_SCR_REM_SCR(src_scr_base_addr, stride, access_size, num_strides, dest_scr_base_addr, scratch_type) \ + SS_SCR_PORT_STREAM(src_scr_base_addr, stride, access_size, num_strides, SCR_SCR_PORT) \ + SS_REM_SCRATCH(dest_scr_base_addr, stride, access_size, num_strides, (SCR_SCR_PORT-32), scratch_type); + +#define SS_SCR_REM_PORT(scr_base_addr, num_strides, mask, remote_port) \ + SS_SCRATCH_READ(scr_base_addr, num_strides, SCR_REM_PORT) \ + SS_REM_PORT((SCR_REM_PORT-32), num_strides, mask, remote_port); + +// could be affine stream to banked scratchpad also +// #define SS_REM_SCRATCH(scr_base_addr, num_bytes, val_port, scratch_type) \ +// __asm__ __volatile("ss_rem_port %0, %1, %2" : : "r"(num_bytes), "r"(scr_base_addr), "i"((val_port<<7) | (scratch_type<<6) | (0<<1) | (1))); + +//Write from output to remote input port +//pos: local=0, left=1, right=2, undef=3 +//13th bit: disable-padding=0, enable-padding=1 +//(might be replaced later by some other RISCV instructions) +#define SS_XFER_LEFT(output_port, input_port, num_strides) \ + __asm__ __volatile__("ss_wr_rd %0, %1, %2" : : "r"(num_strides), "r"(1), "i"((input_port<<6) | (output_port))); +#define SS_XFER_RIGHT(output_port, input_port, num_strides) \ + __asm__ __volatile__("ss_wr_rd %0, %1, 
%2" : : "r"(num_strides), "r"(2), "i"((input_port<<6) | (output_port))) + +#define SS_XFER_LEFT_PAD(output_port, input_port, num_strides) \ + __asm__ __volatile__("ss_wr_rd %0, %1, %2" : : "r"(num_strides), "r"(1 | 4), "i"((input_port<<6) | (output_port))); +#define SS_XFER_RIGHT_PAD(output_port, input_port, num_strides) \ + __asm__ __volatile__("ss_wr_rd %0, %1, %2" : : "r"(num_strides), "r"(2 | 4), "i"((input_port<<6) | (output_port))) + + +// Datatype Encodings +#define T64 0 +#define T32 1 +#define T16 2 +#define T08 3 + +// currently output and data should be of same type +#define SS_CONFIG_ATOMIC_SCR_OP(addr_type, val_type, output_type) \ + __asm__ __volatile__("ss_cfg_atom_op t0, t0, %0" : : "i"( ((val_type<<4)&0x1ADB0 | (output_type<<2)&0x44C | (addr_type)&0x3))) + +//configure the type of indirection -- here multiplier has to be less than 2^7 +//Currently DTYPE MUST be 64 bits +#define SS_CONFIG_INDIRECT_GENERAL(itype,dtype,mult,offset_list) \ + __asm__ __volatile__("ss_cfg_ind %0, %1, %2" : : "r"(offset_list), "r"(mult), "i"( (itype<<2) | (dtype<<0) ) ) + +#define SS_CONFIG_INDIRECT( itype,dtype,mult) SS_CONFIG_INDIRECT_GENERAL(itype,dtype,mult,0) +#define SS_CONFIG_INDIRECT1(itype,dtype,mult,o1) SS_CONFIG_INDIRECT_GENERAL(itype,dtype,mult,o1) +#define SS_CONFIG_INDIRECT2(itype,dtype,mult,o1,o2) SS_CONFIG_INDIRECT_GENERAL(itype,dtype,mult,o1 | o2 << 8) +#define SS_CONFIG_INDIRECT3(itype,dtype,mult,o1,o2,o3) SS_CONFIG_INDIRECT_GENERAL(itype,dtype,mult,o1 | o2 << 8 | o3 << 16) +#define SS_CONFIG_INDIRECT4(itype,dtype,mult,o1,o2,o3,o4) SS_CONFIG_INDIRECT_GENERAL(itype,dtype,mult,o1 | o2 << 8 | o3 << 16 | o4 << 24) + +//Write from output to input port (type -- 3:8-bit,2:16-bit,1:32-bit,0:64-bit) +#define SS_INDIRECT(ind_port, addr_offset, num_elem, input_port) \ + __asm__ __volatile__("ss_ind %0, %1, %2" : : "r"(addr_offset), "r"(num_elem),\ + "i"((input_port<<5) | (ind_port))) + +// generated streams are with base_addr = ind_port[i] (offset[col_ind], +// 
num_elem=num_elem_port[i] (offset[col_ind+1]-offset[col_ind], +// stride=sequential?) +#define SS_INDIRECT_2D(ind_port, addr_offset, num_elem, stride, access_size, num_elem_port, input_port) \ + __asm__ __volatile__("ss_stride %0, %1, %2" : : "r"(stride), "r"(access_size), "i"(num_elem_port | (1<<10))); \ + __asm__ __volatile__("ss_ind %0, %1, %2" : : "r"(addr_offset), "r"(num_elem),\ + "i"((input_port<<5) | (ind_port))); + // "i"((input_port<<5) | (ind_port) | (1<<9))); \ + +// This works for only linear scratchpad right now +#define SS_INDIRECT_SCR_2D(ind_port, addr_offset, num_elem, stride, access_size, num_elem_port, input_port) \ + __asm__ __volatile__("ss_stride %0, %1, %2" : : "r"(stride), "r"(access_size), "i"(num_elem_port | (1<<10))); \ + __asm__ __volatile__("ss_ind %0, %1, %2" : : "r"(addr_offset), "r"(num_elem),\ + "i"((1<<10) | (input_port<<5) | (ind_port))); + + +#define SS_INDIRECT_WR(ind_port, addr_offset, num_elem, output_port) \ + __asm__ __volatile__("ss_ind_wr %0, %1, %2" : : "r"(addr_offset), "r"(num_elem),\ + "i"((output_port<<5) | (ind_port))); + +//Write from output to input port (type -- 3:8-bit,2:16-bit,1:32-bit,0:64-bit) +#define SS_INDIRECT_SCR(ind_port, addr_offset, num_elem, input_port) \ + __asm__ __volatile__("ss_ind %0, %1, %2" : : "r"(addr_offset), "r"(num_elem),\ + "i"((1<<10) | (input_port<<5) | (ind_port))); + +#define SS_INDIRECT_WR_SCR(ind_port, addr_offset, num_elem, output_port) \ + __asm__ __volatile__("ss_ind_wr %0, %1, %2" : : "r"(addr_offset), "r"(num_elem),\ + "i"((1<<10) | (output_port<<5) | (ind_port))); + +//Wait on N number of remote scratchpad writes (num_bytes) +#define SS_WAIT_DF(num_rem_writes, scratch_type) \ + __asm __volatile__("ss_wait_df %0, %1" : : "r"(num_rem_writes), "i"(scratch_type)); + +//Wait with custom bit vector -- probably don't need to use +#define SS_WAIT(bit_vec) \ + __asm__ __volatile__("ss_wait t0, t0, " #bit_vec); \ + +//Wait for all softbrain commands and computations to be visible to 
memory from control core +#define SS_WAIT_ALL() \ + __asm__ __volatile__("ss_wait t0, t0, 0" : : : "memory"); \ + +//Wait for all prior scratch writes to be complete. +#define SS_WAIT_SCR_WR() \ + __asm__ __volatile__("ss_wait t0, t0, 1"); \ + +//wait for everything except outputs to be complete. (useful for debugging) +#define SS_WAIT_COMPUTE() \ + __asm__ __volatile__("ss_wait t0, t0, 2" : : : "memory"); \ + +//wait for all prior scratch reads to be complete +#define SS_WAIT_SCR_RD() \ + __asm__ __volatile__("ss_wait t0, t0, 4"); \ + +//wait for all prior scratch reads to be complete (NOT IMPLEMENTED IN SIMULATOR YET) +#define SS_WAIT_SCR_RD_QUEUED() \ + __asm__ __volatile__("ss_wait t0, t0, 8"); \ + +//wait for all prior scratch reads to be complete (NOT IMPLEMENTED IN SIMULATOR YET) -- NOTE(review): text duplicates the comment above; the macro name suggests memory writes, confirm +#define SS_WAIT_MEM_WR() \ + __asm__ __volatile__("ss_wait t0, t0, 16"); \ + +#define SS_WAIT_SCR_ATOMIC() \ + __asm__ __volatile__("ss_wait t0, t0, 32"); \ + + + +//Indirect Ports +#define P_IND_1 (31) +#define P_IND_2 (30) +#define P_IND_3 (29) +#define P_IND_4 (28) +//TODO: make indirect ports also 1-byte +#define P_IND_5 (27) +// #define P_IND_6 (26) + +//Convenience ports for these functions +#define MEM_SCR_PORT (23) +#define SCR_MEM_PORT (24) +#define SCR_SCR_PORT (25) +#define SCR_REM_PORT (26) + +// #define NET_ADDR_PORT (25) +// #define NET_VAL_PORT (32) + +#endif diff --git a/ss-scheduler/drivers/ss_sched.cpp b/ss-scheduler/drivers/ss_sched.cpp index f816799e0..481188db5 100644 --- a/ss-scheduler/drivers/ss_sched.cpp +++ b/ss-scheduler/drivers/ss_sched.cpp @@ -178,13 +178,14 @@ int main(int argc, char* argv[]) Schedule* sched=nullptr; //Scheduler scheduler(&ssmodel); + /* gams scheduler is not available in repo if(str_schedType == "gams") { auto* scheduler_gams = new GamsScheduler(&ssmodel); scheduler_gams->showGams(show_gams); scheduler_gams->setMipstart(mipstart); scheduler_gams->setSll(sll); scheduler = scheduler_gams; - } else if(str_schedType == "sa") { /*simulated 
annealing*/ + } else*/ if(str_schedType == "sa") { /*simulated annealing*/ scheduler = new SchedulerSimulatedAnnealing(&ssmodel); } else { cerr << "Something Went Wrong with Default Scheduler String"; diff --git a/ss-scheduler/src/scheduler/Makefile b/ss-scheduler/src/scheduler/Makefile index 71eb56b6a..69d604cbe 100644 --- a/ss-scheduler/src/scheduler/Makefile +++ b/ss-scheduler/src/scheduler/Makefile @@ -13,11 +13,8 @@ OPT ?= -O3 LEX = lex YACC = yacc -d -SOURCES= ssdfg.cpp schedule.cpp scheduler.cpp color_mapper.cpp scheduler_sa.cpp scheduler_gams.cpp serialize-schedule.cpp serialize-ssdfg.cpp -GAMS_DIR=gams_models -GAMS_FILES=$(wildcard gams_models/*.gms) -GAMS_INC=$(GAMS_FILES:.gms=.h) +SOURCES= ssdfg.cpp schedule.cpp scheduler.cpp color_mapper.cpp scheduler_sa.cpp serialize-schedule.cpp serialize-ssdfg.cpp INCLUDE_CONFIG=$(level)/src/config/ LIB_DEST=${build}/lib @@ -39,18 +36,11 @@ $(LIB_DEST)/libssscheduler.a: $(OBJECTS) $(LIB_DEST)/libssscheduler.so: $(OBJECTS) $(CXX) $(CXXFLAGS) -shared -o $@ $^ $(LIBS) -$(OBJ_DEST)/scheduler_gams.o: scheduler_gams.cpp $(DEPDIR)/scheduler_gams.d $(GAMS_INC) dfg-parser.tab.h - $(CXX) $(CXXFLAGS) -c -o $@ $< - @$(POSTCOMPILE) - $(OBJ_DEST)/%.o: %.cpp dfg-parser.tab.h $(DEPDIR)/%.d $(CXX) $(CXXFLAGS) -c -o $@ $< @$(POSTCOMPILE) -$(GAMS_DIR)/%.h: $(GAMS_DIR)/%.gms - xxd -i $< > $@ - include $(wildcard $(patsubst %,$(DEPDIR)/%.d,$(basename $(SOURCES)))) #-------- stuff to do with dfg-parser/lexer ----------- @@ -73,7 +63,7 @@ dfg-lex.yy.c: dfg-parser.l dfg-parser.tab.h .phony: clean clean: - -rm -Rf $(LIB_DEST)/*.so $(LIB_DEST)/*.a *.o $(OBJ_DEST)/*.o *.d $(OBJ_DEST)/*.d $(DEPDIR) $(GAMS_INC) dfg-parser.tab.c dfg-parser.tab.o dfg-lex.yy.c dfg-lex.yy.o *.tab.* dfg-parser *.output + -rm -Rf $(LIB_DEST)/*.so $(LIB_DEST)/*.a *.o $(OBJ_DEST)/*.o *.d $(OBJ_DEST)/*.d $(DEPDIR) dfg-parser.tab.c dfg-parser.tab.o dfg-lex.yy.c dfg-lex.yy.o *.tab.* dfg-parser *.output include ../../make.rules diff --git 
a/ss-workloads/dsp-benchmarks/centro-fir/gen.py b/ss-workloads/dsp-benchmarks/centro-fir/gen.py index 7612a3736..9cc4e5eae 100644 --- a/ss-workloads/dsp-benchmarks/centro-fir/gen.py +++ b/ss-workloads/dsp-benchmarks/centro-fir/gen.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import sys, numpy, random, imp -from math import sin, cos, pi +from math import sin, cos, pi output = imp.load_source('output', '../common/output.py') n = int(sys.argv[1]) @@ -10,7 +10,7 @@ if not (m % 2): exit() -numpy.set_printoptions(suppress = True, precision = 4., linewidth = 180, threshold = numpy.nan) +numpy.set_printoptions(suppress = True, precision = 4., linewidth = 180, threshold = sys.maxsize) a = numpy.random.rand(n).astype('complex64') + 1j * numpy.random.rand(n).astype('complex64') b = numpy.random.rand(m // 2).astype('complex64') + 1j * numpy.random.rand(m // 2).astype('complex64') diff --git a/ss-workloads/dsp-benchmarks/common/Makefile.inc b/ss-workloads/dsp-benchmarks/common/Makefile.inc index 5e36d2b56..b8b1703f1 100644 --- a/ss-workloads/dsp-benchmarks/common/Makefile.inc +++ b/ss-workloads/dsp-benchmarks/common/Makefile.inc @@ -22,7 +22,7 @@ DFG_HEADERS = $(DFG:.dfg=.dfg.h) $(DFG_HEADERS): %.dfg.h: %.dfg $(SS_TOOLS)/bin/ss_sched -a sa --max-iters 20000 -d $(FIFO_DEPTH) --verbose $(SBCONFIG) $< -IFLAGS=-I../../common/include -I../../../ss-tools/include -I../common/ -I$(SS_TOOLS)/include/ss-intrin +IFLAGS=-I../../common/include -I$(SS_TOOLS)/include -I../common/ -I$(SS_TOOLS)/include/ss-intrin INTELI=-I/opt/intel/include -I/opt/intel/mkl/include/ #CFLAGS=-O3 -std=c++17 CFLAGS=-Ofast -std=c++17 @@ -61,13 +61,13 @@ mkl.exe: mkl.cc ooo.log: optimized.exe input.data ref.data SBCONFIG=$(SBCONFIG) \ - gem5.opt ~/ss-stack/gem5/configs/example/se.py --cpu-type=detailed --l1d_size=64kB --l1i_size=16kB --l2_size=1024kB --caches --l2cache --cmd=./$< \ + gem5.opt $(SS)/gem5/configs/example/se.py --cpu-type=detailed --l1d_size=64kB --l1i_size=16kB --l2_size=1024kB --caches --l2cache 
--cmd=./$< \ | tee $@ $(LOGS): %.log: %.exe input.data ref.data SBCONFIG=$(SBCONFIG) \ FU_FIFO_LEN=$(FIFO_DEPTH) \ - gem5.opt ~/ss-stack/gem5/configs/example/se.py --cpu-type=MinorCPU \ + gem5.opt $(SS)/gem5/configs/example/se.py --cpu-type=MinorCPU \ --l1d_size=2048kB --l1d_assoc=32 --l1i_size=16kB --l2_size=1024kB --caches --cmd=./$< \ | tee $@ diff --git a/ss-workloads/dsp-benchmarks/common/output.py b/ss-workloads/dsp-benchmarks/common/output.py index c276b6f6d..bf63f9cf2 100644 --- a/ss-workloads/dsp-benchmarks/common/output.py +++ b/ss-workloads/dsp-benchmarks/common/output.py @@ -1,6 +1,7 @@ import numpy +import sys -numpy.set_printoptions(suppress = True, precision = 4., linewidth = 180, threshold = numpy.nan) +numpy.set_printoptions(suppress = True, precision = 4., linewidth = 180, threshold = sys.maxsize) def print_complex_array(filename, array): open(filename, 'w').writelines(['%f %f\n' % (i.real, i.imag) for i in array]) diff --git a/ss-workloads/dsp-benchmarks/qr/run.py b/ss-workloads/dsp-benchmarks/qr/run.py index c020ab42b..4dd6b1718 100755 --- a/ss-workloads/dsp-benchmarks/qr/run.py +++ b/ss-workloads/dsp-benchmarks/qr/run.py @@ -3,7 +3,7 @@ run = imp.load_source('run', '../tools/run.py') -run.run([12, 32], 'N=%d ', ['origin', 'new', 'latency'], 'qr.res') +run.run([12, 32], 'N=%d ', ['origin', 'new'], 'qr.res') SS = os.getenv('SS') run.run([12, 32], 'SBCONFIG=%s/ss-scheduler/configs/revel-1x2.sbmodel N=%s ' % (SS, '%d'), ['new'], 'qr.res') diff --git a/ss-workloads/test-multi/run-tests.sh b/ss-workloads/test-multi/run-tests.sh index caf1af4b6..9ad6b9c03 100755 --- a/ss-workloads/test-multi/run-tests.sh +++ b/ss-workloads/test-multi/run-tests.sh @@ -2,7 +2,7 @@ export SBCONFIG=$SS_TOOLS/configs/revel-1x2.sbmodel -#export LD_LIBRARY_PATH=~/ss-stack/ss_tools/lib +#export LD_LIBRARY_PATH=$SS/ss_tools/lib > fail_list @@ -12,9 +12,9 @@ export SBCONFIG=$SS_TOOLS/configs/revel-1x2.sbmodel function run_test { test=$1 - BACKCGRA=1 LINEAR_SCR=1 timeout 10 
gem5.opt ~/ss-stack/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --cmd=$test - # BACKCGRA=1 SUPRESS_STATS=1 timeout 10 gem5.opt ~/ss-stack/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --cmd=$test - # BACKCGRA=1 SUPRESS_STATS=1 timeout 10 gem5.opt ~/ss-stack/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --ruby --num-cpus=16 --num-dirs=16 --network=simple --topology=Mesh_XY --mesh-rows=2 --cmd=$test + BACKCGRA=1 LINEAR_SCR=1 timeout 10 gem5.opt $SS/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --cmd=$test + # BACKCGRA=1 SUPRESS_STATS=1 timeout 10 gem5.opt $SS/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --cmd=$test + # BACKCGRA=1 SUPRESS_STATS=1 timeout 10 gem5.opt $SS/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --ruby --num-cpus=16 --num-dirs=16 --network=simple --topology=Mesh_XY --mesh-rows=2 --cmd=$test ret_val=$? 
{ diff --git a/ss-workloads/test-single/run-tests.sh b/ss-workloads/test-single/run-tests.sh index db35df9ee..d0b2281a7 100755 --- a/ss-workloads/test-single/run-tests.sh +++ b/ss-workloads/test-single/run-tests.sh @@ -11,9 +11,9 @@ export SBCONFIG=$SS_TOOLS/configs/revel-1x2.sbmodel function run_test { test=$1 - BACKCGRA=1 LINEAR_SCR=1 timeout 10 gem5.opt ~/ss-stack/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --cmd=$test - # BACKCGRA=1 SUPRESS_STATS=1 timeout 10 gem5.opt ~/ss-stack/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --cmd=$test - # BACKCGRA=1 SUPRESS_STATS=1 timeout 10 gem5.opt ~/ss-stack/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --ruby --num-cpus=16 --num-dirs=16 --network=simple --topology=Mesh_XY --mesh-rows=2 --cmd=$test + BACKCGRA=1 LINEAR_SCR=1 timeout 10 gem5.opt $SS/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --cmd=$test + # BACKCGRA=1 SUPRESS_STATS=1 timeout 10 gem5.opt $SS/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --cmd=$test + # BACKCGRA=1 SUPRESS_STATS=1 timeout 10 gem5.opt $SS/gem5/configs/example/se.py --cpu-type=MinorCPU --l1d_size=64kB --l1i_size=16kB --caches --ruby --num-cpus=16 --num-dirs=16 --network=simple --topology=Mesh_XY --mesh-rows=2 --cmd=$test ret_val=$? {