27 if (jlm::rvsdg::is<llvm::PointerType>(*
type))
40 throw std::logic_error(
type->debug_string() +
" not implemented!");
62 std::optional<std::string>
70 const auto &
type = results.front();
86 std::tuple<size_t, std::string, std::string>
89 size_t argument_index = 0;
90 std::ostringstream parameters;
91 std::ostringstream arguments;
97 if (rvsdg::is<BundleType>(argType))
100 if (argument_index != 0)
106 parameters <<
ConvertToCType(argType.get()) <<
" a" << argument_index;
107 arguments <<
"a" << argument_index;
111 return std::make_tuple(argument_index, parameters.str(), arguments.str());
117 std::ostringstream cpp;
119 const auto & function_name =
125 JLM_ASSERT(mem_reqs.size() == mem_resps.size());
135 #define TRACE_CHUNK_SIZE 100000
136 #define TIMEOUT 10000000
138 #ifndef MEMORY_LATENCY
139 #define MEMORY_LATENCY )"
153 #include <verilated.h>
155 #include "verilated_fst_c.h"
157 #include "verilated_vcd_c.h"
159 #define xstr(s) str(s)
167 // ======== Global variables used for simulating the model ========
168 // The verilated model being simulated
171 // Current simulation time, in number of cycles
172 uint64_t main_time = 0;
174 // Can be set from signal handlers, to trigger graceful early termination
175 bool terminate = false;
178 // ======== Global variables imported from other modules ========
183 const auto graphImport = util::assertedCast<llvm::LlvmGraphImport>(arg);
184 cpp <<
"extern \"C\" char " << graphImport->Name() <<
";" << std::endl;
188 // ======== Tracing accesses to main memory ==========
192 uint8_t width; // 2^width bytes
197 bool operator==(const mem_access & other) const {
198 return addr == other.addr && write == other.write && width == other.width && !memcmp(data, other.data, 1<<width);
202 // A log of memory accesses made by the kernel
203 std::vector<mem_access> memory_accesses;
204 // Accesses to regions in this vector of (start, length) pairs are not traced
205 std::vector<std::pair<void*, size_t>> ignored_memory_regions;
207 static void ignore_memory_region(void* start, size_t length) {
208 ignored_memory_regions.emplace_back(start, length);
211 static bool in_ignored_region(void* addr) {
212 for (auto [start, length] : ignored_memory_regions) {
213 if (addr >= start && addr < (char*)start + length)
219 static void* instrumented_load(void* addr, uint8_t width, uint32_t port=0) {
220 void * data = malloc(1 << width);
221 memcpy(data, addr, 1 << width);
222 if (!in_ignored_region(addr))
223 memory_accesses.push_back({addr, false, width, data, port, main_time});
227 static void instrumented_store(void* addr, void *data, uint8_t width, uint32_t port=0) {
228 void * data_copy = malloc(1 << width);
229 memcpy(data_copy, data, 1 << width);
230 memcpy(addr, data_copy, 1 << width);
231 if(!in_ignored_region(addr))
232 memory_accesses.push_back({addr, true, width, data_copy, port, main_time});
235 uint32_t dummy_data[16] = {
253 // ======== Implementation of external memory queues, adding latency to loads ========
256 uint64_t request_time;
264 std::deque<Response> responses;
267 MemoryQueue(int latency, int width, int port) : latency(latency), width(width), port(port) {}
269 // Called right before posedge, can only read from the model
270 void accept_request(uint8_t req_ready, uint8_t req_valid, uint8_t req_write, uint64_t req_addr, uint8_t req_size, void* req_data, uint8_t req_id, uint8_t res_valid, uint8_t res_ready) {
276 // If a response was consumed this cycle, remove it
277 if (res_ready && res_valid) {
278 assert(!responses.empty());
279 responses.pop_front();
282 if (!req_ready || !req_valid)
286 // Stores are performed immediately
287 instrumented_store((void*) req_addr, req_data, req_size, port);
288 responses.push_back({main_time, req_data, req_size, req_id});
290 // Loads are performed immediately, but their response is placed in the queue
291 void* data = instrumented_load((void*) req_addr, req_size, port);
292 responses.push_back({main_time, data, req_size, req_id});
296 // Called right after posedge, can only write to the model
297 void produce_response(uint8_t& req_ready, uint8_t& res_valid, void* res_data, uint8_t& res_id) {
298 if (!responses.empty() && responses.front().request_time + latency <= main_time + 1) {
300 memcpy(res_data, responses.front().data, 1<<responses.front().size);
301 res_id = responses.front().id;
304 memcpy(res_data, dummy_data, width);
308 // Always ready for requests
313 return responses.empty();
318 cpp << "MemoryQueue memory_queues[] = {";
319 for (
size_t i = 0; i < mem_reqs.size(); i++)
321 auto bundle =
dynamic_cast<const BundleType *
>(mem_resps[i]->Type().get());
322 auto size =
JlmSize(&*bundle->get_element_type(
"data")) / 8;
324 cpp <<
"{MEMORY_LATENCY, " << size <<
", " << i <<
"}, ";
328 // ======== Variables and functions for tracing the verilated model ========
337 static void init_tracing() {
340 tfp = new VerilatedFstC;
341 top->trace(tfp, 99); // Trace 99 levels of hierarchy
342 tfp->open(xstr(V_NAME) ".fst");
344 tfp = new VerilatedVcdC;
345 top->trace(tfp, 99); // Trace 99 levels of hierarchy
346 tfp->open(xstr(V_NAME) ".vcd");
351 // Saves the current state of all wires and registers at the given timestep
352 static void capture_trace(uint64_t time) {
361 static void finish_trace() {
362 // Coverage analysis (since test passed)
364 Verilated::mkdir("logs");
365 VerilatedCov::write("logs/coverage.dat");
372 // ======== Setup and execution of the verilated model ========
373 static void posedge();
374 static void negedge();
375 static void verilator_finish();
377 // Called by $time in Verilog. Converts to real, to match SystemC
378 double sc_time_stamp() {
382 // Called once to initialize the verilated model
383 static void verilator_init(int argc, char **argv) {
384 // set up signaling so we can kill the program and still get waveforms
385 struct sigaction action;
386 memset(&action, 0, sizeof(struct sigaction));
387 action.sa_handler = [](int sig){ terminate = true; };
388 sigaction(SIGTERM, &action, NULL);
389 sigaction(SIGKILL, &action, NULL);
390 sigaction(SIGINT, &action, NULL);
392 atexit(verilator_finish);
394 // Set debug level, 0 is off, 9 is highest presently used
395 // May be overridden by commandArgs
398 // Randomization reset policy
399 // May be overridden by commandArgs
400 Verilated::randReset(2);
402 // Verilator must compute traced signals
403 Verilated::traceEverOn(true);
405 // Pass arguments so Verilated code can see them, e.g., $value$plusargs
406 // This needs to be called before you create any model
407 Verilated::commandArgs(argc, argv);
409 // Construct the Verilated model
422 size_t first_ctx_var = reg_args.size() - kernel.GetContextVars().size();
423 for (
size_t i = 0; i < first_ctx_var; i++)
428 cpp <<
" top->i_data_" << i <<
" = 0;" << std::endl;
430 for (
const auto & ctx : kernel.GetContextVars())
433 const auto import = util::assertedCast<rvsdg::GraphImport>(ctx.input->origin());
434 cpp <<
" top->i_data_" << first_ctx_var <<
" = (uint64_t) &" <<
import->Name() <<
";"
440 // Run some cycles with reset set HIGH
456 // Model outputs should be read right before posedge()
457 // Model inputs should be set right after posedge()
458 static void posedge() {
460 std::cout << "terminating\n";
463 assert(!Verilated::gotFinish());
464 assert(top->clk == 0);
466 // Read memory requests just before the rising edge
470 for (
size_t i = 0; i < mem_reqs.size(); i++)
472 const auto req_bt = util::assertedCast<const BundleType>(mem_reqs[i]->
Type().get());
473 const auto has_write = req_bt->get_element_type(
"write") !=
nullptr;
475 cpp <<
" memory_queues[" << i <<
"].accept_request(";
476 cpp <<
"top->mem_" << i <<
"_req_ready, ";
477 cpp <<
"top->mem_" << i <<
"_req_valid, ";
479 cpp <<
" top->mem_" << i <<
"_req_data_write, ";
482 cpp <<
"top->mem_" << i <<
"_req_data_addr, ";
483 cpp <<
"top->mem_" << i <<
"_req_data_size, ";
485 cpp <<
"&top->mem_" << i <<
"_req_data_data, ";
488 cpp <<
"top->mem_" << i <<
"_req_data_id, ";
489 cpp <<
"top->mem_" << i <<
"_res_ready, ";
490 cpp <<
"top->mem_" << i <<
"_res_valid);" << std::endl;
496 // Capturing the posedge trace here would make external inputs appear on negedge
497 // capture_trace(main_time * 2);
500 static void negedge() {
501 assert(!Verilated::gotFinish());
502 assert(top->clk == 1);
504 // Memory responses are ready before the negedge
508 for (
size_t i = 0; i < mem_reqs.size(); i++)
510 cpp <<
" memory_queues[" << i <<
"].produce_response(";
511 cpp <<
"top->mem_" << i <<
"_req_ready, ";
512 cpp <<
"top->mem_" << i <<
"_res_valid, ";
513 cpp <<
"&top->mem_" << i <<
"_res_data_data, ";
514 cpp <<
"top->mem_" << i <<
"_res_data_id);" << std::endl;
520 // Capturing the posedge trace here makes external inputs appear to update with the posedge
521 capture_trace(main_time * 2);
525 capture_trace(main_time * 2 + 1);
529 static void verilator_finish() {
538 << c_return_type.value_or("void") <<
" run_hls(" << std::endl;
539 cpp << c_params << R
"(
542 verilator_init(0, NULL);
544 int start = main_time;
546 // Run cycles until i_ready becomes HIGH
547 for (int i = 0; i < TIMEOUT && !top->i_ready; i++) {
552 std::cout << "i_ready was not set within TIMEOUT" << std::endl;
558 // Pass in input data for one cycle
562 for (
size_t i = 0; i < num_c_params; i++)
565 kernel.GetOperation().type().Arguments()[i].get()))
568 cpp <<
"top->i_data_" << i <<
" = *(uint32_t*) &a" << i <<
";" << std::endl;
570 cpp <<
"top->i_data_" << i <<
" = *(uint64_t*) &a" << i <<
";" << std::endl;
574 cpp <<
"top->i_data_" << i <<
" = (uint64_t) a" << i <<
";" << std::endl;
587 for (
size_t i = 0; i < num_c_params; i++)
589 cpp <<
"top->i_data_" << i <<
" = 0;" << std::endl;
595 // Cycle until o_valid becomes HIGH
596 for (int i = 0; i < TIMEOUT && !top->o_valid; i++) {
601 std::cout << "o_valid was not set within TIMEOUT" << std::endl;
605 std::cout << "finished - took " << (main_time - start) << " cycles" << std::endl;
607 // Ensure all memory queues are empty
609 for (
size_t i = 0; i < mem_reqs.size(); i++)
610 cpp <<
"assert(memory_queues[" << i <<
"].empty());" << std::endl;
612 if (c_return_type.has_value())
613 cpp <<
"return *(" << c_return_type.value() <<
"*)&top->o_data_0;" << std::endl;
619 // ======== Running the kernel compiled as C, with instrumentation ========
621 << c_return_type.value_or("void") <<
" instrumented_ref(" << c_params <<
");" << R
"(
623 extern "C" void reference_load(void* addr, uint64_t width) {
624 instrumented_load(addr, width);
627 extern "C" void reference_store(void* addr, uint64_t width) {
628 instrumented_store(addr, addr, width);
631 extern "C" void reference_alloca(void* start, uint64_t length) {
632 ignore_memory_region(start, length);
635 std::vector<mem_access> ref_memory_accesses;
637 // Calls instrumented_ref in a forked process and stores its memory accesses
642 int fd[2]; // channel 0 for reading and 1 for writing
643 size_t tmp = pipe(fd);
645 if(pid == 0) { // child
646 close(fd[0]); // close fd[0] since child will only write
649 << c_call_args << R"();
651 // Send all memory accesses to the parent
652 size_t cnt = memory_accesses.size();
653 tmp = write(fd[1], &cnt, sizeof(size_t));
654 for (auto & access : memory_accesses){
655 tmp = write(fd[1], &access, sizeof(mem_access));
656 tmp = write(fd[1], access.data, 1<< access.width);
662 close(fd[1]); // close fd[1] since parent will only read
664 // Retrieve all memory_accesses from the child
666 tmp = read(fd[0], &cnt, sizeof(size_t));
667 ref_memory_accesses.resize(cnt);
668 for (auto & access : ref_memory_accesses) {
669 tmp = read(fd[0], &access, sizeof(mem_access));
670 access.data = malloc(1 << access.width);
671 tmp = read(fd[0], access.data, 1 << access.width);
678 // Checks that memory_accesses and ref_memory_accesses are identical within each address
679 static void compare_memory_accesses() {
680 assert (memory_accesses.size() == ref_memory_accesses.size());
682 // Stable sort the memory accesses by only address, keeping order within each address.
683 auto addr_sort = [](const mem_access & a, const mem_access & b) {
684 return a.addr < b.addr;
686 std::stable_sort(memory_accesses.begin(), memory_accesses.end(), addr_sort);
687 std::stable_sort(ref_memory_accesses.begin(), ref_memory_accesses.end(), addr_sort);
688 assert(memory_accesses == ref_memory_accesses);
691 static void empty_mem_acces_vector(std::vector<mem_access> &vec){
695 vec.erase(vec.begin(), vec.end());
698 // ======== Entry point for calling kernel from host device (C code) ========
700 << c_return_type.value_or("void") <<
" " << function_name <<
"(" << c_params <<
")" << R
"(
702 // Execute instrumented version of kernel compiled for the host in a fork
704 << c_call_args << R"();
706 // Execute the verilated model in this process
708 if (c_return_type.has_value())
709 cpp <<
"auto result = ";
710 cpp <<
"run_hls(" << c_call_args <<
");" << std::endl;
713 // Compare traced memory accesses
714 compare_memory_accesses();
716 // Reset structures used for tracing memory operations
717 empty_mem_acces_vector(memory_accesses);
718 empty_mem_acces_vector(ref_memory_accesses);
719 ignored_memory_regions.clear();
722 if (c_return_type.has_value())
723 cpp <<
" return result;" << std::endl;
725 cpp <<
"}" << std::endl;
std::vector< rvsdg::RegionArgument * > get_reg_args(const rvsdg::LambdaNode &lambda)
std::vector< rvsdg::RegionResult * > get_mem_reqs(const rvsdg::LambdaNode &lambda)
static int JlmSize(const jlm::rvsdg::Type *type)
const rvsdg::LambdaNode * get_hls_lambda(llvm::LlvmRvsdgModule &rm)
std::vector< rvsdg::RegionArgument * > get_mem_resps(const rvsdg::LambdaNode &lambda)
const util::FilePath VerilogFile_
std::string GetText(llvm::LlvmRvsdgModule &rm) override
const std::vector< std::shared_ptr< const jlm::rvsdg::Type > > & Arguments() const noexcept
const std::vector< std::shared_ptr< const jlm::rvsdg::Type > > & Results() const noexcept
Region & GetRootRegion() const noexcept
LambdaOperation & GetOperation() const noexcept override
const FunctionType & type() const noexcept
RegionArgumentRange Arguments() noexcept
std::string base() const noexcept
Returns the base name of the file without the path.
#define JLM_UNREACHABLE(msg)
int JlmSize(const jlm::rvsdg::Type *type)
std::tuple< size_t, std::string, std::string > GetParameterListAsC(const rvsdg::LambdaNode &kernel)
static constexpr int MEMORY_RESPONSE_LATENCY
std::optional< std::string > GetReturnTypeAsC(const rvsdg::LambdaNode &kernel)
std::string ConvertToCType(const rvsdg::Type *type)
static std::string type(const Node *n)
@ State
Designate a state type.
static std::string strfmt(Args... args)