Jlm
add-buffers.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2021 David Metz <david.c.metz@ntnu.no>
3  * See COPYING for terms of redistribution.
4  */
5 
9 #include <jlm/hls/ir/hls.hpp>
10 #include <jlm/hls/util/view.hpp>
11 #include <jlm/rvsdg/traverser.hpp>
12 
13 namespace jlm::hls
14 {
15 
16 static rvsdg::Input *
18 {
19 
20  auto user = &out->SingleUser();
21  if (auto br = dynamic_cast<BackEdgeResult *>(user))
22  {
23  return FindUserNode(br->argument());
24  }
25  else if (auto rr = dynamic_cast<rvsdg::RegionResult *>(user))
26  {
27 
28  if (rr->output() && rvsdg::TryGetOwnerNode<LoopNode>(*rr->output()))
29  {
30  return FindUserNode(rr->output());
31  }
32  else
33  {
34  // lambda result
35  return rr;
36  }
37  }
38  else if (auto si = dynamic_cast<rvsdg::StructuralInput *>(user))
39  {
40  JLM_ASSERT(rvsdg::TryGetOwnerNode<LoopNode>(*user));
41  return FindUserNode(si->arguments.begin().ptr());
42  }
43  else if (rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*user))
44  {
45  return user;
46  }
47 
48  JLM_UNREACHABLE("This should not have happened!");
49 }
50 
51 static void
52 PlaceBuffer(rvsdg::Output * out, size_t capacity, bool passThrough)
53 {
54  // places or re-places a buffer on an output
55  auto user = FindUserNode(out);
56  // don't place buffers after constants
57  if (is_constant(rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*out)))
58  {
59  return;
60  }
61  auto [forkNode, forkOperation] = rvsdg::TryGetSimpleNodeAndOptionalOp<ForkOperation>(*out);
62  if (forkOperation && forkOperation->IsConstant())
63  {
64  return;
65  }
66 
67  // TODO: handle out being a buf?
68  auto [bufferNode, bufferOperation] = rvsdg::TryGetSimpleNodeAndOptionalOp<BufferOperation>(*user);
69  if (bufferOperation
70  && (bufferOperation->IsPassThrough() != passThrough
71  || bufferOperation->Capacity() != capacity))
72  {
73  // replace buffer and keep larger size
74  auto node = rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*user);
75  passThrough = passThrough && bufferOperation->IsPassThrough();
76  capacity = std::max(capacity, bufferOperation->Capacity());
77  auto bufOut = BufferOperation::create(*node->input(0)->origin(), capacity, passThrough)[0];
78  node->output(0)->divert_users(bufOut);
79  JLM_ASSERT(node->IsDead());
80  remove(node);
81  }
82  else
83  {
84  // create new buffer
85  auto & directUser = *out->Users().begin();
86  auto newOut = BufferOperation::create(*out, capacity, passThrough)[0];
87  directUser.divert_to(newOut);
88  }
89 }
90 
91 static void
93 {
94  auto addrq = dynamic_cast<const AddressQueueOperation *>(&node->GetOperation());
95  JLM_ASSERT(addrq);
96  // place buffer on addr output
97  PlaceBuffer(node->output(0), addrq->capacity, true);
98 }
99 
100 static void
102 {
103  auto buf = dynamic_cast<const BufferOperation *>(&node->GetOperation());
104  JLM_ASSERT(buf);
105  auto user = FindUserNode(node->output(0));
106  auto [bufferNode, bufferOperation] = rvsdg::TryGetSimpleNodeAndOptionalOp<BufferOperation>(*user);
107  if (bufferOperation)
108  {
109  auto node2 = rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*user);
110  // merge buffers and keep larger size
111  bool passThrough = buf->IsPassThrough() && bufferOperation->IsPassThrough();
112  auto capacity = std::max(buf->Capacity(), bufferOperation->Capacity());
113  auto newOut = BufferOperation::create(*node->input(0)->origin(), capacity, passThrough)[0];
114  JLM_ASSERT(node2->region() == newOut->region());
115  node2->output(0)->divert_users(newOut);
116  JLM_ASSERT(node2->IsDead());
117  remove(node2);
118  JLM_ASSERT(node->IsDead());
119  remove(node);
120  }
121 }
122 
123 static void
125 {
126  // TODO: should this be changed?
127  bool outerLoop = !rvsdg::is<LoopOperation>(loopNode->region()->node());
128  if (outerLoop)
129  {
130  // push buffers above branches, so they also act as output buffers
131  for (size_t i = 0; i < loopNode->noutputs(); ++i)
132  {
133  auto out = loopNode->output(i);
134  auto res = out->results.begin().ptr();
135  auto [branchNode, branchOperation] =
136  rvsdg::TryGetSimpleNodeAndOptionalOp<BranchOperation>(*res->origin());
137  if (!branchOperation)
138  {
139  // this is a memory operation or stream
140  continue;
141  }
142  JLM_ASSERT(branchOperation->loop);
143  auto oldBufInput = &branchNode->output(1)->SingleUser();
144  auto [oldBufferNode, oldBufferOperation] =
145  rvsdg::TryGetSimpleNodeAndOptionalOp<BufferOperation>(*oldBufInput);
146  if (rvsdg::IsOwnerNodeOperation<SinkOperation>(*oldBufInput))
147  {
148  // no backedge
149  continue;
150  }
151  JLM_ASSERT(oldBufferOperation);
152  auto oldBufNode = rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*oldBufInput);
153  // place new buffers
154  PlaceBuffer(
155  branchNode->input(1)->origin(),
156  oldBufferOperation->Capacity(),
157  oldBufferOperation->IsPassThrough());
158  // this buffer should just make the fork buf non-passthrough - needed to avoid combinatorial
159  // cycle
160  PlaceBuffer(
161  branchNode->input(0)->origin(),
162  oldBufferOperation->Capacity(),
163  oldBufferOperation->IsPassThrough());
164  // remove old buffer
165  oldBufNode->output(0)->divert_users(oldBufInput->origin());
166  JLM_ASSERT(oldBufNode->IsDead());
167  remove(oldBufNode);
168  }
169  }
170  else
171  {
172  // add input buffers
173  for (size_t i = 0; i < loopNode->ninputs(); ++i)
174  {
175  auto in = loopNode->input(i);
176  auto arg = in->arguments.begin().ptr();
177  auto user = &arg->SingleUser();
178  // only do this for proper loop variables
179  if (auto [node, muxOperation] = rvsdg::TryGetSimpleNodeAndOptionalOp<MuxOperation>(*user);
180  muxOperation)
181  {
182  if (!muxOperation->loop)
183  {
184  // stream
185  continue;
186  }
187  }
188  else if (rvsdg::IsOwnerNodeOperation<LoopConstantBufferOperation>(*user))
189  {
190  }
191  else
192  {
193  continue;
194  }
195  PlaceBuffer(in->origin(), 2, false);
196  }
197  }
198 }
199 
200 static void
202 {
203  for (auto & node : rvsdg::TopDownTraverser(region))
204  {
205  if (auto structnode = dynamic_cast<rvsdg::StructuralNode *>(node))
206  {
207  auto loop = dynamic_cast<LoopNode *>(node);
208  JLM_ASSERT(loop);
209  OptimizeLoop(loop);
210  for (size_t n = 0; n < structnode->nsubregions(); n++)
211  {
212  AddBuffers(structnode->subregion(n));
213  }
214  }
215  else if (auto simple = dynamic_cast<jlm::rvsdg::SimpleNode *>(node))
216  {
217  if (jlm::rvsdg::is<BufferOperation>(node))
218  {
219  OptimizeBuffer(simple);
220  }
221  else if (jlm::rvsdg::is<ForkOperation>(node))
222  {
223  // OptimizeFork(simple);
224  }
225  else if (jlm::rvsdg::is<BranchOperation>(node))
226  {
227  // OptimizeBranch(simple);
228  }
229  else if (jlm::rvsdg::is<StateGateOperation>(node))
230  {
231  // OptimizeStateGate(simple);
232  }
233  else if (jlm::rvsdg::is<AddressQueueOperation>(node))
234  {
235  OptimizeAddrQ(simple);
236  }
237  }
238  }
239 }
240 
241 static size_t MemoryLatency = 10;
242 
243 static constexpr uint32_t
244 round_up_pow2(uint32_t x)
245 {
246  if (x == 0)
247  return 1;
248  --x;
249  x |= x >> 1;
250  x |= x >> 2;
251  x |= x >> 4;
252  x |= x >> 8;
253  x |= x >> 16;
254  return x + 1;
255 }
256 
257 static void
259 {
260  // const size_t capacity = 256;
261  std::vector<jlm::rvsdg::SimpleNode *> nodes;
262  for (auto & node : rvsdg::TopDownTraverser(region))
263  {
264  if (auto structnode = dynamic_cast<rvsdg::StructuralNode *>(node))
265  {
266  auto loop = dynamic_cast<LoopNode *>(node);
267  JLM_ASSERT(loop);
268  for (size_t n = 0; n < structnode->nsubregions(); n++)
269  {
270  MaximizeBuffers(structnode->subregion(n));
271  }
272  }
273  else if (auto sn = dynamic_cast<jlm::rvsdg::SimpleNode *>(node))
274  {
275  if (rvsdg::is<BufferOperation>(node))
276  {
277  nodes.push_back(sn);
278  }
279  else if (rvsdg::is<DecoupledLoadOperation>(node))
280  {
281  nodes.push_back(sn);
282  }
283  }
284  }
285  for (auto node : nodes)
286  {
287  if (auto dl = dynamic_cast<const DecoupledLoadOperation *>(&node->GetOperation()))
288  {
289  auto capacity = round_up_pow2(MemoryLatency);
290  if (dl->capacity < capacity)
291  {
292  divert_users(
293  node,
295  *node->input(0)->origin(),
296  *node->input(1)->origin(),
297  capacity));
298  remove(node);
299  }
300  }
301  }
302 }
303 
304 static std::vector<size_t>
305 NodeCycles(rvsdg::SimpleNode * node, std::vector<size_t> & input_cycles)
306 {
307  auto max_cycles = *std::max_element(input_cycles.begin(), input_cycles.end());
308  if (auto op = dynamic_cast<const llvm::FBinaryOperation *>(&node->GetOperation()))
309  {
310  if (op->fpop() == llvm::fpop::add)
311  {
312  return { max_cycles + 1 };
313  }
314  }
315  else if (auto op = dynamic_cast<const BufferOperation *>(&node->GetOperation()))
316  {
317  if (op->IsPassThrough())
318  {
319  return { max_cycles + 0 };
320  }
321  return { max_cycles + 1 };
322  }
323  else if (dynamic_cast<const AddressQueueOperation *>(&node->GetOperation()))
324  {
325  return { input_cycles[0] };
326  }
327  else if (rvsdg::is<DecoupledLoadOperation>(node))
328  {
329  return { max_cycles + MemoryLatency, 0 };
330  }
331  else if (rvsdg::is<StateGateOperation>(node))
332  {
333  // handle special state gate that sits on dec_load response
334  auto sg0_user = &node->output(0)->SingleUser();
335  if (rvsdg::IsOwnerNodeOperation<DecoupledLoadOperation>(*sg0_user) && sg0_user->index() == 1)
336  {
337  JLM_ASSERT(max_cycles == 0);
338  return { 0, MemoryLatency };
339  }
340  }
341  else if (rvsdg::is<StoreOperation>(node))
342  {
343  JLM_ASSERT(node->noutputs() == 3);
344  return { max_cycles + MemoryLatency, 0, 0 };
345  }
346  return std::vector<size_t>(node->noutputs(), max_cycles);
347 }
348 
349 const size_t UnlimitedBufferCapacity = std::numeric_limits<uint32_t>::max();
350 
351 static std::vector<size_t>
352 NodeCapacity(rvsdg::SimpleNode * node, std::vector<size_t> & input_capacities)
353 {
354  auto min_capacity = *std::min_element(input_capacities.begin(), input_capacities.end());
355  if (auto op = dynamic_cast<const llvm::FBinaryOperation *>(&node->GetOperation()))
356  {
357  if (op->fpop() == llvm::fpop::add)
358  {
359  return { min_capacity + 1 };
360  }
361  }
362  else if (auto op = dynamic_cast<const BufferOperation *>(&node->GetOperation()))
363  {
364  return { min_capacity + op->Capacity() };
365  }
366  else if (dynamic_cast<const AddressQueueOperation *>(&node->GetOperation()))
367  {
368  return { input_capacities[0] };
369  }
370  else if (auto op = dynamic_cast<const DecoupledLoadOperation *>(&node->GetOperation()))
371  {
372  return { min_capacity + op->capacity, 0 };
373  }
374  else if (rvsdg::is<StateGateOperation>(node))
375  {
376  // handle special state gate that sits on dec_load response
377  auto sg0_user = &node->output(0)->SingleUser();
378  if (rvsdg::IsOwnerNodeOperation<DecoupledLoadOperation>(*sg0_user) && sg0_user->index() == 1)
379  {
380  JLM_ASSERT(min_capacity == UnlimitedBufferCapacity);
381  return { 0, MemoryLatency };
382  }
383  }
384  else if (rvsdg::is<StoreOperation>(node))
385  {
386  return { min_capacity + MemoryLatency, 0, 0 };
387  }
388  return std::vector<size_t>(node->noutputs(), min_capacity);
389 }
390 
391 static void
393  const LoopNode * loop,
394  std::unordered_map<rvsdg::Output *, size_t> & output_cycles,
395  std::unordered_set<rvsdg::Input *> & frontier,
396  std::unordered_set<BackEdgeResult *> & stream_backedges,
397  std::unordered_set<rvsdg::SimpleNode *> & top_muxes)
398 {
399  for (size_t i = 0; i < loop->ninputs(); ++i)
400  {
401  auto in = loop->input(i);
402  auto arg = in->arguments.begin().ptr();
403 
404  auto user = &arg->SingleUser();
405  auto userNode = rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*user);
406  auto [muxNode, muxOperation] = rvsdg::TryGetSimpleNodeAndOptionalOp<MuxOperation>(*user);
407  if (rvsdg::IsOwnerNodeOperation<LoopConstantBufferOperation>(*user)
408  || (muxOperation && muxOperation->loop))
409  {
410  top_muxes.insert(userNode);
411  // we start from these
412  auto out = userNode->output(0);
413  output_cycles[out] = output_cycles[in->origin()];
414  frontier.insert(&out->SingleUser());
415  }
416  // these are needed so we can finish with an empty frontier
417  output_cycles[arg] = output_cycles[in->origin()];
418  frontier.insert(&arg->SingleUser());
419  }
420  for (auto & tn : loop->subregion()->TopNodes())
421  {
422  JLM_ASSERT(is_constant(&tn));
423  auto out = tn.output(0);
424  output_cycles[out] = 0;
425  frontier.insert(&out->SingleUser());
426  }
427  for (auto arg : loop->subregion()->Arguments())
428  {
429  auto backedge = dynamic_cast<BackEdgeArgument *>(arg);
430  if (!backedge)
431  {
432  continue;
433  }
434  auto user = &arg->SingleUser();
435  auto [muxNode, muxOperation] = rvsdg::TryGetSimpleNodeAndOptionalOp<MuxOperation>(*user);
436  if ((muxOperation && muxOperation->loop))
437  {
438  continue;
439  }
440  if (rvsdg::IsOwnerNodeOperation<BufferOperation>(*user))
441  {
442  auto bufNode = rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*user);
443  if (rvsdg::IsOwnerNodeOperation<PredicateBufferOperation>(bufNode->output(0)->SingleUser()))
444  {
445  // skip predicate buffer
446  continue;
447  }
448  }
449  // this comes from somewhere inside the loop
450  output_cycles[arg] = 0;
451  frontier.insert(&arg->SingleUser());
452  stream_backedges.insert(backedge->result());
453  }
454 }
455 
456 static void
458  LoopNode * loop,
459  std::unordered_map<rvsdg::Output *, size_t> & output_cycles,
460  bool analyze_inner_loop = false);
461 
462 static void
464  std::unordered_map<rvsdg::Output *, size_t> & output_cycles,
465  std::unordered_set<rvsdg::Input *> & frontier,
466  std::unordered_set<BackEdgeResult *> & stream_backedges,
467  std::unordered_set<rvsdg::SimpleNode *> & top_muxes)
468 {
469  bool changed = false;
470  do
471  {
472  changed = false;
473  for (auto in : frontier)
474  {
475  if (auto simpleNode = rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*in))
476  {
477  bool all_contained = true;
478  for (size_t i = 0; i < simpleNode->ninputs(); ++i)
479  {
480  auto f = frontier.find(simpleNode->input(i));
481  if (f == frontier.end())
482  {
483  all_contained = false;
484  }
485  }
486  if (!all_contained)
487  continue;
488  // all inputs of node are in frontier - move them forward
489  std::vector<size_t> input_cycles;
490  for (size_t i = 0; i < simpleNode->ninputs(); ++i)
491  {
492  input_cycles.push_back(output_cycles[simpleNode->input(i)->origin()]);
493  frontier.erase(simpleNode->input(i));
494  }
495  std::vector<size_t> out_cycles = NodeCycles(simpleNode, input_cycles);
496 
497  if (top_muxes.find(simpleNode) != top_muxes.end())
498  {
499  if (dynamic_cast<const MuxOperation *>(&simpleNode->GetOperation()))
500  {
501  // TODO: do this in NodeCycles instead?
502  // this works for most cases, but is not ideal if the backedge has an II > 1, and the
503  // predicate hasn't
504  auto pred_latency = output_cycles[simpleNode->input(0)->origin()];
505  auto input_latency = output_cycles[simpleNode->input(1)->origin()];
506  auto backedge_latency = output_cycles[simpleNode->input(2)->origin()];
507  auto out_latency = backedge_latency - pred_latency + input_latency;
508  std::cout << "top_mux " << simpleNode << " pred latency: " << pred_latency
509  << " input latency: " << input_latency
510  << " backedge latency: " << backedge_latency
511  << " out latency: " << out_latency << std::endl;
512  output_cycles[simpleNode->output(0)] = out_latency;
513  }
514  else
515  {
516  JLM_ASSERT(rvsdg::is<LoopConstantBufferOperation>(simpleNode->GetOperation()));
517  // don't update output cycles
518  }
519  }
520  else
521  {
522  for (size_t i = 0; i < simpleNode->noutputs(); ++i)
523  {
524  auto out = simpleNode->output(i);
525  output_cycles[out] = out_cycles[i];
526  frontier.insert(&out->SingleUser());
527  }
528  }
529  changed = true;
530  break;
531  }
532  else if (auto be = dynamic_cast<BackEdgeResult *>(in))
533  {
534  frontier.erase(in);
535  auto out = be->argument();
536  if (stream_backedges.find(be) == stream_backedges.end())
537  {
538  // skip stream backedges
539  output_cycles[out] = output_cycles[in->origin()];
540  frontier.insert(&out->SingleUser());
541  }
542  changed = true;
543  break;
544  }
545  else if (auto rr = dynamic_cast<rvsdg::RegionResult *>(in))
546  {
547  frontier.erase(in);
548  auto out = rr->output();
549  JLM_ASSERT(out);
550  output_cycles[out] = output_cycles[in->origin()];
551  // don't continue frontier out of loop
552  changed = true;
553  break;
554  }
555  else
556  {
557  auto inner_loop = rvsdg::TryGetOwnerNode<LoopNode>(*in);
558  JLM_ASSERT(inner_loop);
559  bool all_contained = true;
560  for (size_t i = 0; i < inner_loop->ninputs(); ++i)
561  {
562  auto f = frontier.find(inner_loop->input(i));
563  if (f == frontier.end())
564  {
565  all_contained = false;
566  }
567  }
568  if (!all_contained)
569  continue;
570  for (size_t i = 0; i < inner_loop->ninputs(); ++i)
571  {
572  frontier.erase(inner_loop->input(i));
573  }
574  // TODO: do we just want the latency of a single iteration here?
575  CalculateLoopCycleDepth(inner_loop, output_cycles, true);
576  for (size_t i = 0; i < inner_loop->noutputs(); ++i)
577  {
578  std::cout << "output latency " << i << " " << output_cycles[inner_loop->output(i)]
579  << std::endl;
580  frontier.insert(&inner_loop->output(i)->SingleUser());
581  }
582  changed = true;
583  break;
584  }
585  }
586  } while (changed);
587  // TODO: is "changed" even necessary or can we just wait for frontier to be empty?
588  if (!frontier.empty())
589  {
590  std::unordered_map<rvsdg::Output *, std::string> o_color;
591  std::unordered_map<rvsdg::Input *, std::string> i_color;
592  for (auto i : frontier)
593  {
594  i_color.insert({ i, "red" });
595  }
596  }
597  JLM_ASSERT(frontier.empty());
598 }
599 
600 void
602  LoopNode * loop,
603  std::unordered_map<rvsdg::Output *, size_t> & output_cycles,
604  bool analyze_inner_loop)
605 {
606  if (!analyze_inner_loop)
607  {
608  for (size_t i = 0; i < loop->ninputs(); ++i)
609  {
610  auto in = loop->input(i);
611  output_cycles[in->origin()] = 0;
612  }
613  }
614  std::unordered_set<rvsdg::Input *> frontier;
615  std::unordered_set<BackEdgeResult *> stream_backedges;
616  std::unordered_set<rvsdg::SimpleNode *> top_muxes;
617  CreateLoopFrontier(loop, output_cycles, frontier, stream_backedges, top_muxes);
618  std::unordered_set<rvsdg::Input *> frontier2(frontier);
619  std::cout << "CalculateLoopCycleDepth(" << loop << ", " << analyze_inner_loop << ")" << std::endl;
620  /* the reason for having two iterations here is a loop value being updated at the end of the loop,
621  * for example the nextRow in SPMV. In theory more iterations could be necessary, until things
622  * only increase by the iterative intensity.
623  */
624  // TODO: should there be more iterations of this? We could iterate until there is no more change
625  // in the difference. This would also give us the II
626  PushCycleFrontier(output_cycles, frontier, stream_backedges, top_muxes);
627 
628  std::unordered_map<rvsdg::Output *, std::string> o_color;
629  std::unordered_map<rvsdg::Input *, std::string> i_color;
630  std::unordered_map<rvsdg::Output *, std::string> tail_label;
631  if (!analyze_inner_loop)
632  {
633  for (auto i : frontier2)
634  {
635  i_color.insert({ i, "red" });
636  }
637  for (auto [o, l] : output_cycles)
638  {
639  tail_label[o] = std::to_string(l);
640  }
641  }
642  std::cout << "second iteration" << std::endl;
643  PushCycleFrontier(output_cycles, frontier2, stream_backedges, top_muxes);
644  if (!analyze_inner_loop)
645  {
646  for (auto [o, l] : output_cycles)
647  {
648  tail_label[o] = std::to_string(l);
649  }
650  }
651 }
652 
653 void
654 setMemoryLatency(size_t memoryLatency)
655 {
656  MemoryLatency = memoryLatency;
657 }
658 
659 const size_t MaximumBufferSize = 512;
660 
661 static size_t
662 PlaceBufferLoop(rvsdg::Output * out, size_t min_capacity, bool passThrough)
663 {
664  // places or re-places a buffer on an output
665  // don't place buffers after constants
666  JLM_ASSERT(!is_constant(rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*out)));
667  auto [forkNode, forkOperation] = rvsdg::TryGetSimpleNodeAndOptionalOp<ForkOperation>(*out);
668  JLM_ASSERT(!(forkOperation && forkOperation->IsConstant()));
669 
670  if (rvsdg::is<rvsdg::LambdaOperation>(out->region()->node()))
671  {
672  // don't place buffers outside loops
673  return min_capacity;
674  }
675 
676  auto arg = dynamic_cast<rvsdg::RegionArgument *>(out);
677  if (arg && arg->input())
678  {
679  return PlaceBufferLoop(arg->input()->origin(), min_capacity, passThrough);
680  }
681 
682  // push buf above loop_const_buf
683  if (auto [loopConstantNode, op] =
684  rvsdg::TryGetSimpleNodeAndOptionalOp<LoopConstantBufferOperation>(*out);
685  op)
686  {
687  return std::min(
688  PlaceBufferLoop(loopConstantNode->input(0)->origin(), min_capacity, passThrough),
689  PlaceBufferLoop(loopConstantNode->input(1)->origin(), min_capacity, passThrough));
690  }
691 
692  if (auto [node, bufferOperation] = rvsdg::TryGetSimpleNodeAndOptionalOp<BufferOperation>(*out);
693  bufferOperation)
694  {
695  // replace buffer and keep larger size
696  passThrough = passThrough && bufferOperation->IsPassThrough();
697  size_t capacity = round_up_pow2(bufferOperation->Capacity() + min_capacity);
698  // if the maximum buffer size is exceeded place a smaller buffer, but pretend a large one was
699  // placed, to prevent additional buffers further down
700  auto actual_capacity = std::min(capacity, MaximumBufferSize);
701  auto bufOut =
702  BufferOperation::create(*node->input(0)->origin(), actual_capacity, passThrough)[0];
703  node->output(0)->divert_users(bufOut);
704  JLM_ASSERT(node->IsDead());
705  remove(node);
706  return capacity;
707  }
708  else
709  {
710  // create new buffer
711  auto & directUser = *out->Users().begin();
712  size_t capacity = round_up_pow2(min_capacity);
713  // if the maximum buffer size is exceeded place a smaller buffer, but pretend a large one was
714  // placed, to prevent additional buffers further down
715  auto actual_capacity = std::min(capacity, MaximumBufferSize);
716  auto newOut = BufferOperation::create(*out, actual_capacity, passThrough)[0];
717  directUser.divert_to(newOut);
718  return capacity;
719  }
720 }
721 
722 static void
724  LoopNode * loop,
725  std::unordered_map<rvsdg::Output *, size_t> & output_cycles,
726  std::unordered_map<rvsdg::Output *, size_t> & buffer_capacity,
727  bool analyze_inner_loop = false)
728 {
729  if (!analyze_inner_loop)
730  {
731  for (size_t i = 0; i < loop->ninputs(); ++i)
732  {
733  auto in = loop->input(i);
734  buffer_capacity[in->origin()] = 0;
735  }
736  }
737  std::unordered_set<rvsdg::Input *> frontier;
738  std::unordered_set<BackEdgeResult *> stream_backedges;
739  std::unordered_set<rvsdg::SimpleNode *> top_muxes;
740  CreateLoopFrontier(loop, buffer_capacity, frontier, stream_backedges, top_muxes);
741  // set buffer capacity for constant nodes to max
742  for (auto & tn : loop->subregion()->TopNodes())
743  {
744  auto out = tn.output(0);
745  // don't use size_t max, since that is used to signal down below
746  buffer_capacity[out] = UnlimitedBufferCapacity;
747  }
748  // same for inputs - we only care what happens within the loop
749  for (size_t i = 0; i < loop->ninputs(); ++i)
750  {
751  auto in = loop->input(i);
752  buffer_capacity[in->origin()] = UnlimitedBufferCapacity;
753  buffer_capacity[in->arguments.begin().ptr()] = UnlimitedBufferCapacity;
754  }
755  // TODO: unlimited buffers for stream backedges, but loose that property if a fork is reached?
756  // this might also make the current special case handling of addrqs unnecessary
757 
758  std::unordered_map<rvsdg::Output *, std::string> o_color;
759  std::unordered_map<rvsdg::Input *, std::string> i_color;
760  std::unordered_map<rvsdg::Output *, std::string> tail_label;
761  if (!analyze_inner_loop)
762  {
763  for (auto i : frontier)
764  {
765  i_color.insert({ i, "red" });
766  }
767  for (auto [o, l] : buffer_capacity)
768  {
769  tail_label[o] = std::to_string(l);
770  }
771  }
772 
773  bool changed = false;
774  do
775  {
776  changed = false;
777  for (auto in : frontier)
778  {
779  if (auto simpleNode = rvsdg::TryGetOwnerNode<rvsdg::SimpleNode>(*in))
780  {
781  bool all_contained = true;
782  for (size_t i = 0; i < simpleNode->ninputs(); ++i)
783  {
784  auto f = frontier.find(simpleNode->input(i));
785  if (f == frontier.end())
786  {
787  all_contained = false;
788  }
789  }
790  if (!all_contained)
791  continue;
792  // all inputs of node are in frontier - move them forward
793  size_t max_cycles = 0;
794  for (size_t i = 0; i < simpleNode->ninputs(); ++i)
795  {
796  max_cycles = std::max(max_cycles, output_cycles[simpleNode->input(i)->origin()]);
797  frontier.erase(simpleNode->input(i));
798  }
799 
800  std::vector<size_t> input_capacities;
801  // adjust capacities
802  for (size_t i = 0; i < simpleNode->ninputs(); ++i)
803  {
804  auto capacity = buffer_capacity[simpleNode->input(i)->origin()];
805  if (!analyze_inner_loop && (!rvsdg::is<AddressQueueOperation>(simpleNode))
806  && capacity < max_cycles)
807  {
808  size_t capacity_diff = max_cycles - capacity;
809  capacity += PlaceBufferLoop(simpleNode->input(i)->origin(), capacity_diff, true);
810  buffer_capacity[simpleNode->input(i)->origin()] = capacity;
811  }
812  input_capacities.push_back(capacity);
813  }
814 
815  if (top_muxes.find(simpleNode) != top_muxes.end())
816  {
817  // we reached our starting point again
818  auto mux = dynamic_cast<const MuxOperation *>(&simpleNode->GetOperation());
819  if (mux)
820  {
821  std::cout << "top_mux " << simpleNode
822  << " pred capacity: " << buffer_capacity[simpleNode->input(0)->origin()]
823  << " backedge capacity: " << buffer_capacity[simpleNode->input(2)->origin()]
824  << std::endl;
825  }
826  }
827  else
828  {
829  std::vector<size_t> out_capacities = NodeCapacity(simpleNode, input_capacities);
830  for (size_t i = 0; i < simpleNode->noutputs(); ++i)
831  {
832  auto out = simpleNode->output(i);
833  buffer_capacity[out] = out_capacities[i];
834  JLM_ASSERT(analyze_inner_loop || buffer_capacity[out] >= output_cycles[out]);
835  frontier.insert(&out->SingleUser());
836  }
837  }
838  changed = true;
839  break;
840  }
841  else if (auto be = dynamic_cast<BackEdgeResult *>(in))
842  {
843  frontier.erase(in);
844  auto out = be->argument();
845  buffer_capacity[out] = buffer_capacity[in->origin()];
846  if (stream_backedges.find(be) == stream_backedges.end())
847  {
848  frontier.insert(&out->SingleUser());
849  }
850  changed = true;
851  break;
852  }
853  else if (auto rr = dynamic_cast<rvsdg::RegionResult *>(in))
854  {
855  frontier.erase(in);
856  auto out = rr->output();
857  JLM_ASSERT(out);
858  buffer_capacity[out] = buffer_capacity[in->origin()];
859  // don't continue frontier out of loop
860  changed = true;
861  break;
862  }
863  else
864  {
865  auto inner_loop = rvsdg::TryGetOwnerNode<LoopNode>(*in);
866  JLM_ASSERT(inner_loop);
867  bool all_contained = true;
868  for (size_t i = 0; i < inner_loop->ninputs(); ++i)
869  {
870  auto f = frontier.find(inner_loop->input(i));
871  if (f == frontier.end())
872  {
873  all_contained = false;
874  }
875  }
876  if (!all_contained)
877  continue;
878  // all inputs of node are in frontier - move them forward
879  size_t max_cycles = 0;
880  for (size_t i = 0; i < inner_loop->ninputs(); ++i)
881  {
882  max_cycles = std::max(max_cycles, output_cycles[inner_loop->input(i)->origin()]);
883  frontier.erase(inner_loop->input(i));
884  }
885  // adjust capacities
886  for (size_t i = 0; i < inner_loop->ninputs(); ++i)
887  {
888  auto capacity = buffer_capacity[inner_loop->input(i)->origin()];
889  if (!analyze_inner_loop && capacity < max_cycles)
890  {
891  auto user = &inner_loop->input(i)->arguments.begin().ptr()->SingleUser();
892  auto [muxNode, muxOperation] =
893  rvsdg::TryGetSimpleNodeAndOptionalOp<MuxOperation>(*user);
894  if ((muxOperation && muxOperation->loop)
895  || rvsdg::IsOwnerNodeOperation<LoopConstantBufferOperation>(*user))
896  {
897  size_t capacity_diff = max_cycles - capacity;
898  capacity += PlaceBufferLoop(inner_loop->input(i)->origin(), capacity_diff, true);
899  buffer_capacity[inner_loop->input(i)->origin()] = capacity;
900  }
901  else
902  {
903  // don't put buffers on decouples, streams, and addrq stuff
904  }
905  }
906  }
907 
908  AdjustLoopBuffers(inner_loop, output_cycles, buffer_capacity, true);
909  for (size_t i = 0; i < inner_loop->noutputs(); ++i)
910  {
911  frontier.insert(&inner_loop->output(i)->SingleUser());
912  }
913  changed = true;
914  break;
915  }
916  }
917  } while (changed);
918  // TODO: is "changed" even necessary or can we just wait for frontier to be empty?
919  // benefit of it is we won't get infinite loop in case something violates this
920  JLM_ASSERT(frontier.empty());
921  // TODO: take iterative intensity into account. E.g. we can use half the buffer capacity if II is
922  // 2
923  // TODO: remove buffers on cycles?
924  // TODO: don't place buf2 on const buf if it gos up to another const buf - even through a branch?
925  // TODO: still run buffer resize pass, but without upsizing?
926 
927  if (!analyze_inner_loop)
928  {
929  for (auto [o, l] : buffer_capacity)
930  {
931  tail_label[o] = std::to_string(l);
932  }
933  }
934 }
935 
936 static void
938 {
939  for (auto node : rvsdg::TopDownTraverser(region))
940  {
941  if (auto loop = dynamic_cast<LoopNode *>(node))
942  {
943  // process inner loops first
944  CalculateLoopDepths(loop->subregion());
945  std::unordered_map<rvsdg::Output *, size_t> output_cycles;
946  CalculateLoopCycleDepth(loop, output_cycles);
947  std::unordered_map<rvsdg::Output *, size_t> buffer_capacity;
948  AdjustLoopBuffers(loop, output_cycles, buffer_capacity);
949  }
950  }
951 }
952 
953 BufferInsertion::~BufferInsertion() noexcept = default;
954 
957 {}
958 
959 void
961 {
962  const auto & graph = rvsdgModule.Rvsdg();
963  const auto rootRegion = &graph.GetRootRegion();
964  if (rootRegion->numNodes() != 1)
965  {
966  throw std::logic_error("Root should have only one node now");
967  }
968 
969  const auto lambda = dynamic_cast<const rvsdg::LambdaNode *>(rootRegion->Nodes().begin().ptr());
970  if (!lambda)
971  {
972  throw std::logic_error("Node needs to be a lambda");
973  }
974 
975  AddBuffers(lambda->subregion());
976  MaximizeBuffers(lambda->subregion());
977  CalculateLoopDepths(lambda->subregion());
978 }
979 
980 }
void Run(rvsdg::RvsdgModule &rvsdgModule, util::StatisticsCollector &statisticsCollector) override
Perform RVSDG transformation.
~BufferInsertion() noexcept override
static std::vector< jlm::rvsdg::Output * > create(jlm::rvsdg::Output &value, size_t capacity, bool pass_through=false)
Definition: hls.hpp:438
static std::vector< jlm::rvsdg::Output * > create(jlm::rvsdg::Output &addr, jlm::rvsdg::Output &load_result, size_t capacity)
Definition: hls.hpp:1157
rvsdg::Region * subregion() const noexcept
Definition: hls.hpp:725
Region & GetRootRegion() const noexcept
Definition: graph.hpp:99
Output * origin() const noexcept
Definition: node.hpp:58
Lambda node.
Definition: lambda.hpp:83
bool IsDead() const noexcept
Determines whether the node is dead.
Definition: node.hpp:688
rvsdg::Region * region() const noexcept
Definition: node.hpp:761
size_t ninputs() const noexcept
Definition: node.hpp:609
size_t noutputs() const noexcept
Definition: node.hpp:644
rvsdg::Input & SingleUser() noexcept
Definition: node.hpp:347
rvsdg::Region * region() const noexcept
Definition: node.cpp:151
UsersRange Users()
Definition: node.hpp:354
Represents the argument of a region.
Definition: region.hpp:41
Represents the result of a region.
Definition: region.hpp:120
Represent acyclic RVSDG subgraphs.
Definition: region.hpp:213
RegionArgumentRange Arguments() noexcept
Definition: region.hpp:272
rvsdg::StructuralNode * node() const noexcept
Definition: region.hpp:369
TopNodeRange TopNodes() noexcept
Definition: region.hpp:309
Graph & Rvsdg() noexcept
Definition: RvsdgModule.hpp:57
const SimpleOperation & GetOperation() const noexcept override
Definition: simple-node.cpp:48
NodeInput * input(size_t index) const noexcept
Definition: simple-node.hpp:82
NodeOutput * output(size_t index) const noexcept
Definition: simple-node.hpp:88
StructuralOutput * output(size_t index) const noexcept
StructuralInput * input(size_t index) const noexcept
Represents an RVSDG transformation.
Iterator begin() noexcept
#define JLM_ASSERT(x)
Definition: common.hpp:16
#define JLM_UNREACHABLE(msg)
Definition: common.hpp:43
static size_t MemoryLatency
static void CalculateLoopCycleDepth(LoopNode *loop, std::unordered_map< rvsdg::Output *, size_t > &output_cycles, bool analyze_inner_loop=false)
static std::vector< size_t > NodeCycles(rvsdg::SimpleNode *node, std::vector< size_t > &input_cycles)
static void CreateLoopFrontier(const LoopNode *loop, std::unordered_map< rvsdg::Output *, size_t > &output_cycles, std::unordered_set< rvsdg::Input * > &frontier, std::unordered_set< BackEdgeResult * > &stream_backedges, std::unordered_set< rvsdg::SimpleNode * > &top_muxes)
static void PushCycleFrontier(std::unordered_map< rvsdg::Output *, size_t > &output_cycles, std::unordered_set< rvsdg::Input * > &frontier, std::unordered_set< BackEdgeResult * > &stream_backedges, std::unordered_set< rvsdg::SimpleNode * > &top_muxes)
static void AddBuffers(rvsdg::Region *region)
static constexpr uint32_t round_up_pow2(uint32_t x)
static rvsdg::Input * FindUserNode(rvsdg::Output *out)
Definition: add-buffers.cpp:17
static void divert_users(jlm::rvsdg::Output *output, Context &ctx)
Definition: cne.cpp:504
static bool is_constant(const rvsdg::Node *node)
Definition: rvsdg2rhls.hpp:20
static void OptimizeBuffer(rvsdg::SimpleNode *node)
static void PlaceBuffer(rvsdg::Output *out, size_t capacity, bool passThrough)
Definition: add-buffers.cpp:52
const size_t UnlimitedBufferCapacity
static void CalculateLoopDepths(rvsdg::Region *region)
static std::vector< size_t > NodeCapacity(rvsdg::SimpleNode *node, std::vector< size_t > &input_capacities)
static size_t PlaceBufferLoop(rvsdg::Output *out, size_t min_capacity, bool passThrough)
static void OptimizeLoop(LoopNode *loopNode)
static void OptimizeAddrQ(rvsdg::SimpleNode *node)
Definition: add-buffers.cpp:92
void setMemoryLatency(size_t memoryLatency)
static void MaximizeBuffers(rvsdg::Region *region)
static void AdjustLoopBuffers(LoopNode *loop, std::unordered_map< rvsdg::Output *, size_t > &output_cycles, std::unordered_map< rvsdg::Output *, size_t > &buffer_capacity, bool analyze_inner_loop=false)
const size_t MaximumBufferSize
static void remove(Node *node)
Definition: region.hpp:932