@article{Dally:1986p339, author = {William J Dally and Charles L Seitz}, journal = {Distributed Computing}, title = {The Torus Routing Chip}, abstract = {The torus routing chip (TRC) is a self-timed chip that performs deadlock-free cut-through routing in k-ary n-cube multiprocessor interconnection networks using a new method of deadlock avoidance called virtual channels. A prototype TRC with byte-wide self-timed communication channels achieved on first silicon a throughput of 64 Mbits/s in each dimension, about an order of magnitude better performance than the communication networks used by machines such as the Caltech Cosmic Cube or Intel iPSC. The latency of the cut-through routing of only 150 ns per routing step largely eliminates message locality considerations in the concurrent programs for such machines. The design and testing of the TRC as a self-timed chip was no more difficult than it would have been for a synchronous chip.}, affiliation = {Department of Computer Science, California Institute of Technology}, number = {4}, pages = {187--196}, volume = {1}, year = {1986}, doi = {10.1007/BF01660031}, url = {http://www.springerlink.com/content/ulp36l24158gwh44/fulltext.pdf} }
@article{Dally:1987p11, author = {William J Dally and Charles L Seitz}, journal = {IEEE Transactions on Computers}, title = {Deadlock-free message routing in multiprocessor interconnection networks}, abstract = {A deadlock-free routing algorithm can be generated for arbitrary interconnection networks using the concept of virtual channels. A necessary and sufficient condition for deadlock-free routing is the absence of cycles in a channel dependency graph. Given an arbitrary network and a routing function, the cycles of the channel dependency graph can be removed by splitting physical channels into groups of virtual channels. This method is used to develop deadlock-free routing algorithms for k-ary n-cubes, for cube-connected cycles, and for shuffle-exchange networks.}, affiliation = {Artificial Intelligence Laboratory and Laboratory for Computer Science, Massachusetts Institute of Technology; Department of Computer Science, California Institute of Technology}, number = {5}, pages = {547--553}, volume = {36}, year = {1987}, doi = {10.1109/TC.1987.1676939}, url = {http://ieeexplore.ieee.org/iel5/12/35264/01676939.pdf?tp=&arnumber=1676939&isnumber=35264} }
@article{Dally:1990p337, author = {William J Dally}, journal = {IEEE Transactions on Computers}, title = {Performance Analysis of k-ary n-cube Interconnection Networks}, abstract = {VLSI communication networks are wire-limited, i.e. the cost of a network is not a function of the number of switches required, but rather a function of the wiring density required to construct the network. Communication networks of varying dimensions are analyzed under the assumption of constant wire bisection. Expressions for the latency, average case throughput, and hot-spot throughput of k-ary n-cube networks with constant bisection that agree closely with experimental measurements are derived. It is shown that low-dimensional networks (e.g. tori) have lower latency and higher hot-spot throughput than high-dimensional networks (e.g. binary n-cubes) with the same bisection width.}, affiliation = {Artificial Intelligence Laboratory and Laboratory for Computer Science, Massachusetts Institute of Technology}, number = {6}, pages = {775--785}, volume = {39}, year = {1990}, doi = {10.1109/12.53599}, url = {http://ieeexplore.ieee.org/iel1/12/1935/00053599.pdf?tp=&arnumber=53599&isnumber=1935} }
@article{Dally:1991p336, author = {William J Dally}, journal = {IEEE Transactions on Computers}, title = {Express Cubes: Improving the Performance of k-ary n-cube Interconnection Networks}, abstract = {The author discusses express cubes, k-ary n-cube interconnection networks augmented by express channels that provide a short path for nonlocal messages. An express cube combines the logarithmic diameter of a multistage network with the wire-efficiency and ability to exploit locality of a low-dimensional mesh network. The insertion of express channels reduces the network diameter and thus the distance component of network latency. Wire length is increased, allowing networks to operate with latencies that approach the physical speed-of-light limitation rather than being limited by node delays. Express channels increase wire bisection in a manner that allows the bisection to be controlled independently of the choice of radix, dimension, and channel width. By increasing wire bisection to saturate the available wiring media, throughput can be substantially increased. With an express cube both latency and throughput are wire-limited and within a small factor of the physical limit on performance.}, affiliation = {Artificial Intelligence Laboratory and Laboratory for Computer Science, Massachusetts Institute of Technology}, number = {9}, pages = {1016--1023}, volume = {40}, year = {1991}, doi = {10.1109/12.83652}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=83652&isnumber=2730} }
@article{Dally:1992p335, author = {William J Dally}, journal = {IEEE Transactions on Parallel and Distributed Systems}, title = {Virtual-channel flow control}, abstract = {Network throughput can be increased by dividing the buffer storage associated with each network channel into several virtual channels. Each physical channel is associated with several small queues, virtual channels, rather than a single deep queue. The virtual channels associated with one physical channel are allocated independently but compete with each other for physical bandwidth. Virtual channels decouple buffer resources from transmission resources. This decoupling allows active messages to pass blocked messages using network bandwidth that would otherwise be left idle. The paper studies the performance of networks using virtual channels using both analysis and simulation. These studies show that virtual channels increase network throughput, by a factor of four for 10-stage networks, and reduce the dependence of throughput on the depth of the network.}, affiliation = {Artificial Intelligence Laboratory and Laboratory for Computer Science, Massachusetts Institute of Technology}, number = {2}, pages = {194--205}, volume = {3}, year = {1992}, doi = {10.1109/71.127260}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=127260&isnumber=3563} }
@article{Dally:1993p302, author = {William J Dally and Hiromichi Aoki}, journal = {IEEE Transactions on Parallel and Distributed Systems}, title = {Deadlock-Free Adaptive Routing in Multicomputer Networks Using Virtual Channels}, abstract = {The use of adaptive routing in a multicomputer interconnection network improves network performance by using all available paths and provides fault tolerance by allowing messages to be routed around failed channels and nodes. Two deadlock-free adaptive routing algorithms are described. Both algorithms allocate virtual channels using a count of the number of dimension reversals a packet has performed to eliminate cycles in resource dependency graphs. The static algorithm eliminates cycles in the network channel dependency graph. The dynamic algorithm improves virtual channel utilization by permitting dependency cycles and instead eliminating cycles in the packet wait-for graph. It is proved that these algorithms are deadlock-free. Experimental measurements of their performance are presented.}, affiliation = {Massachusetts Institute of Technology}, number = {4}, pages = {466--475}, volume = {4}, year = {1993}, doi = {10.1109/71.219761}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=219761&isnumber=5755} }
@article{Peh:2001p332, author = {Li-Shiuan Peh and William J Dally}, journal = {IEEE Micro}, title = {A Delay Model for Router Microarchitectures}, abstract = {This article introduces a router delay model that takes into account the pipelined nature of contemporary routers and proposes pipelines matched to the specific flow control method employed. Given the type of flow control and router parameters, the model returns router latency in technology-independent units and the number of pipeline stages as a function of cycle time. We apply this model to derive realistic pipelines for wormhole and virtual-channel routers and compare their performance. Contrary to the conclusions of previous models, our results show that the latency of a virtual-channel router doesn't increase as we scale the number of virtual channels up to 8 per physical channel. Our simulation results also show that a virtual-channel router gains throughput of up to 40\% over a wormhole router.}, affiliation = {Computer Systems Laboratory, Stanford University}, number = {1}, pages = {26--34}, volume = {21}, year = {2001}, doi = {10.1109/40.903059}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=903059&isnumber=19527} }
@article{Towles:2002p331, author = {Brian Towles and William J Dally}, journal = {Computer Architecture Letters}, title = {Worst-case Traffic for Oblivious Routing Functions}, abstract = {This paper presents an algorithm to find a worst-case traffic pattern for any oblivious routing algorithm on an arbitrary interconnection network topology. The linearity of channel loading offered by oblivious routing algorithms enables the problem to be mapped to a bipartite maximum-weight matching, which can be solved in polynomial time for routing functions with a polynomial number of paths. Finding exact worst case performance was previously intractable, and we demonstrate an example case where traditional characterization techniques overestimate the throughput of a particular routing algorithm by 47\%.}, affiliation = {Computer Systems Laboratory, Stanford University}, number = {1}, pages = {4--7}, volume = {1}, year = {2002}, doi = {10.1109/L-CA.2002.12}, url = {http://ieeexplore.ieee.org/iel5/10208/34596/01650106.pdf?tp=&arnumber=1650106&isnumber=34596} }
@article{Towles:2003p325, author = {Brian Towles and William J Dally}, journal = {IEEE/ACM Transactions on Networking}, title = {Guaranteed scheduling for switches with configuration overhead}, abstract = {We present three algorithms that provide performance guarantees for scheduling switches, such as optical switches, with configuration overhead. Each algorithm emulates an unconstrained (zero overhead) switch by accumulating a batch of configuration requests and generating a corresponding schedule for a constrained switch. Speedup is required both to cover the configuration overhead of the switch and to compensate for empty slots left by the scheduling algorithm. Scheduling algorithms are characterized by the number of configurations Ns they require to cover a batch of requests and the speedup required to compensate for empty slots Smin. Initially, all switch reconfiguration is assumed to occur simultaneously. We show that a well-known exact matching algorithm, EXACT, leaves no empty slots (i.e., Smin=1), but requires Ns ≈ N^2 configurations for an N-port switch leading to high configuration overhead or large batches and, hence, high delay. We present two new algorithms that reduce the number of configurations required substantially. MIN covers a batch of requests in the minimum possible number of configurations, Ns=N, but at the expense of many empty slots, Smin ≈ 4 log2 N. DOUBLE strikes a balance, requiring twice as many configurations, Ns=2N, while reducing the number of empty slots so that Smin=2. Loosening the restriction on reconfiguration times, the scheduling problem is cast as an open shop. The best known practical scheduling algorithm for open shops, list scheduling (LIST), gives the same emulation requirements as DOUBLE. Therefore, we conclude that our architecture gains no advantages from allowing arbitrary switch reconfiguration. Finally, we show that DOUBLE and LIST offer the lowest required speedup to emulate an unconstrained switch across a wide range of port count and delay.}, affiliation = {Computer Systems Laboratory, Stanford University}, number = {5}, pages = {835--847}, volume = {11}, year = {2003}, doi = {10.1109/TNET.2003.818190}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1237460&isnumber=27747} }
@article{Singh:2004p322, author = {Arjun Singh and William J Dally}, journal = {Computer Architecture Letters}, title = {Buffer and Delay Bounds in High Radix Interconnection Networks}, abstract = {We apply recent results in queueing theory to propose a methodology for bounding the buffer depth and packet delay in high radix interconnection networks. While most work in interconnection networks has been focused on the throughput and average latency in such systems, few studies have been done providing statistical guarantees for buffer depth and packet delays. These parameters are key in the design and performance of a network. We present a methodology for calculating such bounds for a practical high radix network and through extensive simulations show its effectiveness for both bursty and non-bursty injection traffic. Our results suggest that modest speedups and buffer depths enable reliable networks without flow control to be constructed.}, affiliation = {Computer Systems Laboratory, Stanford University}, number = {1}, pages = {8--11}, volume = {3}, year = {2004}, doi = {10.1109/L-CA.2004.2}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1650129&isnumber=34598} }
@article{Singh:2004p324, author = {Arjun Singh and William J Dally and Brian Towles and Amit K Gupta}, journal = {Computer Architecture Letters}, title = {Globally Adaptive Load-Balanced Routing on Tori}, abstract = {We introduce a new method of adaptive routing on k-ary n-cubes, Globally Adaptive Load-Balance (GAL). GAL makes global routing decisions using global information. In contrast, most previous adaptive routing algorithms make local routing decisions using local information (typically channel queue depth). GAL senses global congestion using segmented injection queues to decide the directions to route in each dimension. It further load balances the network by routing in the selected directions adaptively. Using global information, GAL achieves the performance (latency and throughput) of minimal adaptive routing on benign traffic patterns and performs as well as the best obliviously load-balanced routing algorithm (GOAL) on adversarial traffic.}, affiliation = {Computer Systems Laboratory, Stanford University}, number = {1}, pages = {2--5}, volume = {3}, year = {2004}, doi = {10.1109/L-CA.2004.8}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1650123&isnumber=34598} }
@article{Gupta:2006p321, author = {Amit K Gupta and William J Dally}, journal = {Computer Architecture Letters}, title = {Topology Optimization of Interconnection Networks}, abstract = {This paper describes an automatic optimization tool that searches a family of network topologies to select the topology that best achieves a specified set of design goals while satisfying specified packaging constraints. Our tool uses a model of signaling technology that relates bandwidth, cost and distance of links. This model captures the distance-dependent bandwidth of modern high-speed electrical links and the cost differential between electrical and optical links. Using our optimization tool, we explore the design space of hybrid Clos-torus (C-T) networks. For a representative set of packaging constraints we determine the optimal hybrid C-T topology to minimize cost and the optimal C-T topology to minimize latency for various packet lengths. We then use the tool to measure the sensitivity of the optimal topology to several important packaging constraints such as pin count and critical distance.}, affiliation = {Computer Systems Laboratory, Stanford University}, number = {1}, pages = {10--13}, volume = {5}, year = {2006}, doi = {10.1109/L-CA.2006.8}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1650135&isnumber=34600} }
@article{Owens:2007p340, author = {John D Owens and William J Dally and Ron Ho and D N Jayasimha and Stephen W Keckler and Li-Shiuan Peh}, journal = {IEEE Micro}, title = {Research Challenges for On-Chip Interconnection Networks}, abstract = {On-chip interconnection networks are rapidly becoming a key enabling technology for commodity multicore processors and SoCs common in consumer embedded systems. The National Science Foundation initiated a workshop that addressed upcoming research issues in OCIN technology, design, and implementation and set a direction for researchers in the field.}, affiliation = {University of California, Davis; Stanford University; Sun Microsystems; Intel Corporation; University of Texas at Austin; Princeton University}, number = {5}, pages = {96--108}, volume = {27}, year = {2007}, doi = {10.1109/MM.2007.4378787}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=4378787&isnumber=4378774} }
@inproceedings{Dally:1987p338, author = {William J Dally}, booktitle = {Proceedings of the 1987 Stanford Conference on Advanced Research in VLSI}, title = {Wire-efficient VLSI Multiprocessor Communication Networks}, pages = {391--415}, year = {1987} }
@inproceedings{Dally:1990p10, author = {William J Dally}, booktitle = {ISCA '90: Proceedings of the 17th annual International Symposium on Computer Architecture}, title = {Virtual-channel flow control}, abstract = {Network throughput can be increased by dividing the buffer storage associated with each network channel into several virtual channels [DalSei]. Each physical channel is associated with several small queues, virtual channels, rather than a single deep queue. The virtual channels associated with one physical channel are allocated independently but compete with each other for physical bandwidth. Virtual channels decouple buffer resources from transmission resources. This decoupling allows active messages to pass blocked messages using network bandwidth that would otherwise be left idle. Simulation studies show that, given a fixed amount of buffer storage per link, virtual-channel flow control increases throughput by a factor of 3.5, approaching the capacity of the network.}, affiliation = {Artificial Intelligence Laboratory and Laboratory for Computer Science, Massachusetts Institute of Technology}, pages = {60--68}, year = {1990}, doi = {10.1145/325164.325115}, url = {http://portal.acm.org/ft_gateway.cfm?id=325115&type=pdf&coll=GUIDE&dl=GUIDE&CFID=38846775&CFTOKEN=95618102} }
@inproceedings{Peh:2000p333, author = {Li-Shiuan Peh and William J Dally}, booktitle = {HOTI '00: Proceedings of the 8th Symposium on High Performance Interconnects}, title = {A Delay Model for Router Microarchitectures}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2000} }
@inproceedings{Peh:2000p334, author = {Li-Shiuan Peh and William J Dally}, booktitle = {HPCA '00: Proceedings of the Sixth International Symposium on High-Performance Computer Architecture}, title = {Flit-Reservation Flow Control}, abstract = {This paper presents flit-reservation flow control, in which control flits traverse the network in advance of data flits, reserving buffers and channel bandwidth. Flit-reservation flow control requires control flits to precede data flits, which can be realized through fast on-chip control wires or the pipelining of control flits one or more cycles ahead of data flits. Scheduling ahead of data arrival enables buffers to be held only during actual buffer usage, unlike existing flow control methods. It also eliminates data latency due to routing and arbitration decisions. Simulations with fast control wires show that flit-reservation flow control extends the 63\% throughput attained by virtual-channel flow control with 8 flit buffers per input to 77\%, an improvement of 20\% with equal storage and bandwidth overheads. Its throughput with 6 buffers (77\%) approaches that of virtual-channel flow control using 16 buffers (80\%), reflecting the significant buffer savings as a result of efficient buffer utilization. Data latency is also reduced by 15.6\% as compared to virtual-channel flow control. The improvement in throughput is similarly realized by the pipelining of each control flit a cycle ahead of its data flits, using control and data networks with the same propagation delay of 1 cycle.}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {73--84}, year = {2000}, doi = {10.1109/HPCA.2000.824340}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=824340&isnumber=17815} }
@inproceedings{Dally:2001p8, author = {William J Dally and Brian Towles}, booktitle = {DAC '01: Proceedings of the 38th Conference on Design Automation}, title = {Route Packets, Not Wires: On-Chip Interconnection Networks}, abstract = {Using on-chip interconnection networks in place of ad hoc global wiring structures the top-level wires on a chip and facilitates modular design. With this approach, system modules (processors, memories, peripherals, etc.) communicate by sending packets to one another over the network. The structured network wiring gives well-controlled electrical parameters that eliminate timing iterations and enable the use of high-performance circuits to reduce latency and increase bandwidth. The area overhead required to implement an on-chip network is modest; we estimate 6.6\%. This paper introduces the concept of on-chip networks, sketches a simple network, and discusses some challenges in the architecture and design of these networks.}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {684--689}, year = {2001}, doi = {10.1145/378239.379048}, url = {http://portal.acm.org/ft_gateway.cfm?id=379048&type=pdf&coll=GUIDE&dl=GUIDE&CFID=38846775&CFTOKEN=95618102} }
@inproceedings{Peh:2001p6, author = {Li-Shiuan Peh and William J Dally}, booktitle = {HPCA '01: Proceedings of the Seventh International Symposium on High-Performance Computer Architecture}, title = {A Delay Model and Speculative Architecture for Pipelined Routers}, abstract = {This paper introduces a router delay model that accurately models key aspects of modern routers. The model accounts for the pipelined nature of contemporary routers, the specific flow control method employed, the delay of the flow-control credit path, and the sharing of crossbar ports across virtual channels. Motivated by this model, we introduce a microarchitecture for a speculative virtual-channel router that significantly reduces its router latency to that of a wormhole router. Simulations using our pipelined model give results that differ considerably from the commonly assumed unit-latency model, which is unreasonably optimistic. Using realistic pipeline models, we compare wormhole [6] and virtual-channel flow control [4]. Our results show that a speculative virtual-channel router has the same per-hop router latency as a wormhole router, while improving throughput by up to 40\%.}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {255--266}, year = {2001}, doi = {10.1109/HPCA.2001.903268}, url = {http://cva.stanford.edu/publications/2001/specmodel.pdf} }
@inproceedings{Gupta:2002p328, author = {Amit K Gupta and William J Dally and Arjun Singh and Brian Towles}, booktitle = {HOTI '02: Proceedings of the 10th Symposium on High Performance Interconnects}, title = {Scalable Opto-Electronic Network (SOENet)}, abstract = {In applications such as processor-memory interconnect, I/O networks, and router switch fabrics, an interconnection network must be scalable to thousands of high-bandwidth terminals while at the same time being economical in small configurations and robust in the presence of single-point faults. Emerging optical technology enables new topologies by allowing links to cover large distances but at a significant premium in cost compared to high-speed electrical links. Existing topologies do not cost-effectively exploit these optical links. In this paper we introduce SOENet, a family of topologies that exploits emerging high-speed optical and electrical links to provide cost effective scalability, and graceful degradation in the presence of faults. We show that SOENet scales more economically than alternative topologies. For networks scalable to 32,000 nodes, a 32-node SOENet costs 4x less than a 3-D torus. Finally we investigate the fault tolerance properties of these networks and show that they degrade more gracefully in the presence of faults than alternative topologies.}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {71--76}, year = {2002}, doi = {10.1109/CONECT.2002.1039259}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1039259&isnumber=22275} }
@inproceedings{Towles:2002p326, author = {Brian Towles and William J Dally}, booktitle = {INFOCOM '02: Proceedings of the 21st Annual Joint Conference of the IEEE Computer and Communications Societies}, title = {Guaranteed scheduling for switches with configuration overhead}, abstract = {We present three algorithms that provide performance guarantees for scheduling switches, such as optical switches, with configuration overhead. Each algorithm emulates an unconstrained (zero overhead) switch by accumulating a batch of configuration requests and generating a corresponding schedule for a constrained switch. Speedup is required both to cover the configuration overhead of the switch and to compensate for empty slots left by the scheduling algorithm. Scheduling algorithms are characterized by the number of configurations, Ns, they require to cover a batch of requests, and the speedup required to compensate for empty slots, Smin. We show that a well-known exact matching algorithm, EXACT, leaves no empty slots (i.e. Smin=1), but requires Ns ≈ N^2 configurations for an N-port switch leading to high overhead or large batches and hence high delay. We present two new algorithms that reduce the number of configurations required substantially. MIN covers a batch of requests in the minimum possible number of configurations, Ns=N, but at the expense of many empty slots, Smin ≈ 4 log2 N. DOUBLE strikes a balance, requiring twice as many configurations, Ns=2N, while reducing the number of empty slots so that Smin=2. We show that DOUBLE offers the lowest required speedup to emulate an unconstrained switch across a wide range of port count and delay.}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {342--351}, year = {2002}, doi = {10.1109/INFCOM.2002.1019276}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1019276&isnumber=21921} }
@inproceedings{Towles:2002p330, author = {Brian Towles and William J Dally}, booktitle = {SPAA '02: Proceedings of the 14th Annual ACM Symposium on Parallelism in Algorithms and Architectures}, title = {Worst-case Traffic for Oblivious Routing Functions}, abstract = {This paper presents an algorithm to find a worst-case traffic pattern for any oblivious routing algorithm on an arbitrary interconnection network topology. The linearity of channel loading offered by oblivious routing algorithms enables the problem to be mapped to a bipartite maximum-weight matching, which can be solved in polynomial time for most practical routing functions. Finding exact worst-case performance was previously intractable, and we demonstrate an example case where traditional characterization techniques overestimate the throughput of a particular routing algorithm by 47\%.}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {1--8}, year = {2002}, doi = {10.1145/564870.564872}, url = {http://portal.acm.org/ft_gateway.cfm?id=564872&type=pdf&coll=Portal&dl=GUIDE&CFID=15425253&CFTOKEN=38064353} }
@inproceedings{Singh:2002p329, author = {Arjun Singh and William J Dally and Brian Towles and Amit K Gupta}, booktitle = {SPAA '02: Proceedings of the 14th Annual ACM Symposium on Parallelism in Algorithms and Architectures}, title = {Locality-Preserving Randomized Oblivious Routing on Torus Networks}, abstract = {We introduce Randomized Local Balance (RLB), a routing algorithm that strikes a balance between locality and load balance in torus networks, and analyze RLB's performance for benign and adversarial traffic permutations. Our results show that RLB outperforms deterministic algorithms (25\% more bandwidth than Dimension Order Routing) and minimal oblivious algorithms (50\% more bandwidth than 2 phase ROMM [9]) on worst-case traffic. At the same time, RLB offers higher throughput on local traffic than a fully randomized algorithm (4.6 times more bandwidth than VAL (Valiant's algorithm) [15] in the best case). RLBth (RLB threshold) improves the locality of RLB to match the throughput of minimal algorithms on very local traffic in exchange for a 4\% reduction in worst-case throughput compared to RLB. Both RLB and RLBth give better throughput than all other algorithms we tested on randomly selected traffic permutations. While RLB algorithms have somewhat lower guaranteed bandwidth than VAL they have much lower latency at low offered loads (up to 3.65 times less for RLBth).}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {9--13}, year = {2002}, doi = {10.1145/564870.564873}, url = {http://portal.acm.org/ft_gateway.cfm?id=564873&type=pdf&coll=Portal&dl=GUIDE&CFID=15425253&CFTOKEN=38064353} }
@inproceedings{Singh:2003p18, author = {Arjun Singh and William J Dally and Amit K Gupta and Brian Towles}, booktitle = {ISCA '03: Proceedings of the 30th annual International Symposium on Computer Architecture}, title = {GOAL: a load-balanced adaptive routing algorithm for torus networks}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {194--205}, year = {2003}, doi = {10.1145/859618.859641}, url = {http://portal.acm.org/ft_gateway.cfm?id=859641&type=pdf&coll=GUIDE&dl=GUIDE&CFID=2942704&CFTOKEN=46477594} }
@inproceedings{Towles:2003p327, author = {Brian Towles and Stephen P Boyd and William J Dally}, booktitle = {SPAA '03: Proceedings of the 15th Annual ACM Symposium on Parallelism in Algorithms and Architectures}, title = {Throughput-Centric Routing Algorithm Design}, abstract = {The increasing application space of interconnection networks now encompasses several applications, such as packet routing and I/O interconnect, where the throughput of a routing algorithm, not just its locality, becomes an important performance metric. We show that the problem of designing oblivious routing algorithms that have high worst-case or average-case throughput can be cast as a linear program. Globally optimal solutions to these optimization problems can be efficiently found, yielding provably good oblivious routing algorithms. Applying these techniques to k-ary 2-cube (tori) networks shows that previous routing algorithms sacrifice too much locality to achieve optimal worst-case throughput. This motivates the development of two new algorithms, IVAL and 2TURN, which improve locality to within 0.3\% of optimal for an 8-ary 2-cube. Both algorithms have simple, deadlock-free implementations. Expanding the analysis of tori to average-case throughput reveals that there is a weak tradeoff between average-case and worst-case throughput. Specifically, both the IVAL and 2TURN algorithms developed for the worst-case also have good average-case throughput.}, affiliation = {Department of Electrical Engineering, Stanford University}, pages = {200--209}, year = {2003}, doi = {10.1145/777412.777444}, url = {http://portal.acm.org/ft_gateway.cfm?id=777444&type=pdf&coll=Portal&dl=GUIDE&CFID=15425253&CFTOKEN=38064353} }
@inproceedings{Singh:2004p323, author = {Arjun Singh and William J Dally and Amit K Gupta and Brian Towles}, booktitle = {SPAA '04: Proceedings of the 16th Annual ACM Symposium on Parallelism in Algorithms and Architectures}, title = {Adaptive Channel Queue Routing on k-ary n-cubes}, abstract = {This paper introduces a new adaptive method, Channel Queue Routing (CQR), for load-balanced routing on k-ary n-cube interconnection networks. CQR estimates global congestion in the network from its channel queues while relying on the implicit network backpressure to transfer congestion information to these queues. It uses this estimate to decide the directions to route in each dimension. It further load balances the network by routing in the selected directions adaptively. The only other algorithm that uses global congestion in its routing decision is the Globally Adaptive Load-Balance (GAL) algorithm introduced in [13]. GAL performs better than any other known routing algorithm on a wide variety of throughput and latency metrics. However, there are four serious issues with GAL. First, it has very high latency once it starts routing traffic non-minimally. Second, it is slow to adapt to changes in traffic. Third, it requires a complex method to achieve stability. Finally, it is complex to implement. These issues are all related to GAL's use of injection queue length to infer global congestion. CQR uses channel queues rather than injection queues to estimate global congestion. In doing so, it overcomes the limitations of GAL described above while matching its high performance on all the performance metrics described in [13]. CQR gives much lower latency than GAL at loads where non-minimal routing is required. It adapts rapidly to changes in traffic, is unconditionally stable, and is simple to implement.}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {11--19}, year = {2004}, doi = {10.1145/1007912.1007915}, url = {http://portal.acm.org/ft_gateway.cfm?id=1007915&type=pdf&coll=Portal&dl=GUIDE&CFID=15425253&CFTOKEN=38064353} }
@inproceedings{Kim:2005p17, author = {John Kim and William J Dally and Brian Towles and Amit K Gupta}, booktitle = {ISCA '05: Proceedings of the 32nd annual International Symposium on Computer Architecture}, title = {Microarchitecture of a High-Radix Router}, abstract = {Evolving semiconductor and circuit technology has greatly increased the pin bandwidth available to a router chip. In the early 90s, routers were limited to 10Gb/s of pin bandwidth. Today 1Tb/s is feasible, and we expect 20Tb/s of I/O bandwidth by 2010. A high-radix router that provides many narrow ports is more effective in converting pin bandwidth to reduced latency and reduced cost than the alternative of building a router with a few wide ports. However, increasing the radix (or degree) of a router raises several challenges as internal switches and allocators scale as the square of the radix. This paper addresses these challenges by proposing and evaluating alternative microarchitectures for high radix routers. We show that the use of a hierarchical switch organization with per-virtual-channel buffers in each subswitch enables an area savings of 40\% compared to a fully buffered crossbar and a throughput increase of 20-60\% compared to a conventional crossbar implementation.}, affiliation = {Computer Systems Laboratory, Stanford University; D.E. Shaw Research and Development}, pages = {420--431}, year = {2005}, doi = {10.1109/ISCA.2005.35}, url = {http://portal.acm.org/ft_gateway.cfm?id=1070005&type=pdf&coll=GUIDE&dl=GUIDE&CFID=2942704&CFTOKEN=46477594} }
@inproceedings{Balfour:2006p1, author = {James Balfour and William J Dally}, booktitle = {ICS '06: Proceedings of the 20th annual International Conference on Supercomputing}, title = {Design Tradeoffs for Tiled CMP On-Chip Networks}, abstract = {We develop detailed area and energy models for on-chip interconnection networks and describe tradeoffs in the design of efficient networks for tiled chip multiprocessors. Using these detailed models we investigate how aspects of the network architecture including topology, channel width, routing strategy, and buffer size affect performance and impact area and energy efficiency. We simulate the performance of a variety of on-chip networks designed for tiled chip multiprocessors implemented in an advanced VLSI process and compare area and energy efficiencies estimated from our models. We demonstrate that the introduction of a second parallel network can increase performance while improving efficiency, and evaluate different strategies for distributing traffic over the subnetworks. Drawing on insights from our analysis, we present a concentrated mesh topology with replicated subnetworks and express channels which provides a 24\% improvement in area efficiency and a 48\% improvement in energy efficiency over other networks evaluated in this study.}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {187--198}, year = {2006}, doi = {10.1145/1183401.1183430}, url = {http://portal.acm.org/ft_gateway.cfm?id=1183430&type=pdf&coll=GUIDE&dl=GUIDE&CFID=38831261&CFTOKEN=39727116} }
@inproceedings{Kim:2006p16, author = {John Kim and William J Dally and Dennis Abts}, booktitle = {SC '06: Proceedings of the 2006 ACM/IEEE Conference on Supercomputing}, title = {Adaptive Routing in High-Radix Clos Network}, abstract = {Recent increases in the pin bandwidth of integrated circuits have motivated an increase in the degree or radix of interconnection network routers. The folded-Clos network can take advantage of these high-radix routers and this paper investigates adaptive routing in such networks. We show that adaptive routing, if done properly, outperforms oblivious routing by providing lower latency, lower latency variance, and higher throughput with limited buffering. Adaptive routing is particularly useful in load balancing around nonuniformities caused by deterministically routed traffic or the presence of faults in the network. We evaluate alternative allocation algorithms used in adaptive routing and compare their performance. The use of randomization in the allocation algorithms can simplify the implementation while sacrificing minimal performance. The cost of adaptive routing, in terms of router latency and area, is increased in high-radix routers. We show that the use of imprecise queue information reduces the implementation complexity and precomputation of the allocations minimizes the impact of adaptive routing on router latency.}, affiliation = {Computer Systems Laboratory, Stanford University; Cray, Inc.}, year = {2006}, doi = {10.1109/SC.2006.10}, url = {http://portal.acm.org/ft_gateway.cfm?id=1188552&type=pdf&coll=GUIDE&dl=GUIDE&CFID=2942704&CFTOKEN=46477594} }
@inproceedings{Scott:2006p320, author = {Steve Scott and Dennis Abts and John Kim and William J Dally}, booktitle = {ISCA '06: Proceedings of the 33rd annual International Symposium on Computer Architecture}, title = {The BlackWidow High-Radix Clos Network}, abstract = {This paper describes the radix-64 folded-Clos network of the Cray BlackWidow scalable vector multiprocessor. We describe the BlackWidow network which scales to 32K processors with a worst-case diameter of seven hops, and the underlying high-radix router microarchitecture and its implementation. By using a high-radix router with many narrow channels we are able to take advantage of the higher pin density and faster signaling rates available in modern ASIC technology. The BlackWidow router is an 800 MHz ASIC with 64 18.75Gb/s bidirectional ports for an aggregate off-chip bandwidth of 2.4Tb/s. Each port consists of three 6.25Gb/s differential signals in each direction. The router supports deterministic and adaptive packet routing with separate buffering for request and reply virtual channels. The router is organized hierarchically (Kim et al., 2005) as an 8×8 array of tiles which simplifies arbitration by avoiding long wires in the arbiters. Each tile of the array contains a router port, its associated buffering, and an 8×8 router subswitch. The router ASIC is implemented in a 90nm CMOS standard cell ASIC technology and went from concept to tapeout in 17 months.}, affiliation = {Cray Inc.; Computer Systems Laboratory, Stanford University}, pages = {16--28}, year = {2006}, doi = {10.1109/ISCA.2006.40}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1635937&isnumber=34298} }
@inproceedings{Kim:2007p4, author = {John Kim and James Balfour and William J Dally}, booktitle = {MICRO 40: Proceedings of the 40th Annual IEEE/ACM International Symposium on Microarchitecture}, title = {Flattened Butterfly Topology for On-Chip Networks}, abstract = {With the trend towards an increasing number of cores in multicore processors, the on-chip network that connects the cores needs to scale efficiently. In this work, we propose the use of high-radix networks in on-chip networks and describe how the flattened butterfly topology can be mapped to on-chip networks. By using high-radix routers to reduce the diameter of the network, the flattened butterfly offers lower latency and energy consumption than conventional on-chip topologies. In addition, by properly using bypass channels in the flattened butterfly network, non-minimal routing can be employed without increasing latency or the energy consumption.}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2007}, url = {http://cva.stanford.edu/publications/2007/MICRO_FBFLY.pdf} }
@inproceedings{Kim:2007p5, author = {John Kim and William J Dally and Dennis Abts}, booktitle = {ISCA '07: Proceedings of the 34th annual International Symposium on Computer Architecture}, title = {Flattened Butterfly: A Cost-Efficient Topology for High-Radix Networks}, abstract = {Increasing integrated-circuit pin bandwidth has motivated a corresponding increase in the degree or radix of interconnection networks and their routers. This paper introduces the flattened butterfly, a cost-efficient topology for high-radix networks. On benign (load-balanced) traffic, the flattened butterfly approaches the cost/performance of a butterfly network and has roughly half the cost of a comparable performance Clos network. The advantage over the Clos is achieved by eliminating redundant hops when they are not needed for load balance. On adversarial traffic, the flattened butterfly matches the cost/performance of a folded-Clos network and provides an order of magnitude better performance than a conventional butterfly. In this case, global adaptive routing is used to switch the flattened butterfly from minimal to non-minimal routing --- using redundant hops only when they are needed. Minimal and non-minimal, oblivious and adaptive routing algorithms are evaluated on the flattened butterfly. We show that load-balancing adversarial traffic requires non-minimal globally-adaptive routing and show that sequential allocators are required to avoid transient load imbalance when using adaptive routing algorithms. We also compare the cost of the flattened butterfly to folded-Clos, hypercube, and butterfly networks with identical capacity and show that the flattened butterfly is more cost-efficient than folded-Clos and hypercube topologies.}, affiliation = {Computer Systems Laboratory, Stanford University; Cray, Inc.}, pages = {126--137}, year = {2007}, doi = {10.1145/1250662.1250679}, url = {http://portal.acm.org/ft_gateway.cfm?id=1250679&type=pdf&coll=GUIDE&dl=GUIDE&CFID=38846775&CFTOKEN=95618102} }
@inproceedings{Kim:2008p341, author = {John Kim and William J Dally and Steve Scott and Dennis Abts}, booktitle = {ISCA '08: Proceedings of the 35th annual International Symposium on Computer Architecture}, title = {Technology-Driven, Highly-Scalable Dragonfly Topology}, abstract = {Evolving technology and increasing pin-bandwidth motivate the use of high-radix routers to reduce the diameter, latency, and cost of interconnection networks. High-radix networks, however, require longer cables than their low-radix counterparts. Because cables dominate network cost, the number of cables, and particularly the number of long, global cables should be minimized to realize an efficient network. In this paper, we introduce the dragonfly topology which uses a group of high-radix routers as a virtual router to increase the effective radix of the network. With this organization, each minimally routed packet traverses at most one global channel. By reducing global channels, a dragonfly reduces cost by 20\% compared to a flattened butterfly and by 52\% compared to a folded Clos network in configurations with at least 16K nodes. We also introduce two new variants of global adaptive routing that enable load-balanced routing in the dragonfly. Each router in a dragonfly must make an adaptive routing decision based on the state of a global channel connected to a different router. Because of the indirect nature of this routing decision, conventional adaptive routing algorithms give degraded performance. We introduce the use of selective virtual-channel discrimination and the use of credit round-trip latency to both sense and signal channel congestion. The combination of these two methods gives throughput and latency that approaches that of an ideal adaptive routing algorithm.}, affiliation = {Northwestern University; Stanford University; Cray Inc.; Google Inc.}, pages = {77--88}, year = {2008}, doi = {10.1109/ISCA.2008.19}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=4556717&isnumber=4556700} }
@inproceedings{Michelogiannakis:2009p353, author = {George Michelogiannakis and James Balfour and William J Dally}, booktitle = {HPCA '09: Proceedings of the Fifteenth International Symposium on High-Performance Computer Architecture}, title = {Elastic-Buffer Flow Control for On-Chip Networks}, abstract = {This paper presents elastic buffers (EBs), an efficient flow-control scheme that uses the storage already present in pipelined channels in place of explicit input virtual-channel buffers (VCBs). With this approach, the channels themselves act as distributed FIFO buffers. Without VCBs, and hence virtual channels (VCs), deadlock prevention is achieved by duplicating physical channels. We develop a channel occupancy detector to apply universal globally adaptive load-balancing (UGAL) routing to load balance traffic in networks using EBs. Using EBs results in up to 8\% (12\% for low-swing channels) improvement in peak throughput per unit power compared to a VC flow-control network. These gains allow for a wider network datapath to be used to offset the removal of VCBs and increase throughput for a fixed power budget. EB networks have identical zero-load latency to VC networks operating under the same frequency. The microarchitecture of an EB router is considerably simpler than a VC router because allocators and credits are not required. For 5×5 mesh routers, this results in an 18\% improvement in the cycle time.}, affiliation = {Computer Systems Laboratory, Stanford University}, pages = {151--162}, year = {2009}, doi = {10.1109/HPCA.2009.4798250}, url = {http://nocs.stanford.edu/files/publications/mihelog_hpca09.pdf} }
@inproceedings{Jiang:2009p376, author = {Nan Jiang and John Kim and William J Dally}, booktitle = {ISCA '09: Proceedings of the 36th annual International Symposium on Computer Architecture}, title = {Indirect Adaptive Routing on Large Scale Interconnection Networks}, abstract = {Recently proposed high-radix interconnection networks require global adaptive routing to achieve optimum performance. Existing direct adaptive routing methods are slow to sense congestion remote from the source router and hence misroute many packets before such congestion is detected. This paper introduces indirect global adaptive routing (IAR) in which the adaptive routing decision uses information that is not directly available at the source router. We describe four IAR routing methods: credit round trip (CRT), progressive adaptive routing (PAR), piggyback routing (PB), and reservation routing (RES). We evaluate each of these methods on the dragonfly topology under both steady-state and transient loads. Our results show that PB, PAR, and CRT all achieve good performance. PB provides the best absolute performance, with 2-7\% lower latency on steady-state uniform random traffic at 70\% load, while PAR provides the fastest response on transient loads. We also evaluate the implementation costs of the indirect adaptive routing methods and show that PB has the lowest implementation cost requiring <1\% increase in the total storage of a typical high-radix router.}, affiliation = {Computer Systems Laboratory, Stanford University; KAIST}, pages = {220--231}, year = {2009}, doi = {10.1145/1555815.1555783}, url = {http://portal.acm.org/ft_gateway.cfm?id=1555783&type=pdf&coll=GUIDE&dl=GUIDE&CFID=64415435&CFTOKEN=97925745} }
@inproceedings{Michelogiannakis:2009p591, author = {George Michelogiannakis and William J Dally}, booktitle = {SC '09: Proceedings of the 2009 ACM/IEEE Conference on High Performance Computing, Networking, Storage and Analysis}, title = {Router Designs for Elastic Buffer On-Chip Networks}, abstract = {This paper explores the design space of elastic buffer (EB) routers by evaluating three representative designs. We propose an enhanced two-stage EB router which maximizes throughput by achieving a 42\% reduction in cycle time and 20\% reduction in occupied area by using look-ahead routing and replacing the three-slot output EBs in the baseline router of [17] with two-slot EBs. We also propose a single-stage router which merges the two pipeline stages to avoid pipelining overhead. This design reduces zero-load latency by 24\% compared to the enhanced two-stage router if both are operated at the same clock frequency; moreover, the single-stage router reduces the required energy per transferred bit and occupied area by 29\% and 30\% respectively, compared to the enhanced two-stage router. However, the cycle time of the enhanced two-stage router is 26\% smaller than that of the single-stage router.}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2009}, doi = {10.1145/1654059.1654062}, url = {http://nocs.stanford.edu/files/publications/mihelog_sc09.pdf} }
@inproceedings{Becker:2009p590, author = {Daniel U Becker and William J Dally}, booktitle = {SC '09: Proceedings of the 2009 ACM/IEEE Conference on High Performance Computing, Networking, Storage and Analysis}, title = {Allocator Implementations for Network-on-Chip Routers}, abstract = {The present contribution explores the design space for virtual channel (VC) and switch allocators in network-on-chip (NoC) routers. Based on detailed RTL-level implementations, we evaluate representative allocator architectures in terms of matching quality, delay, area and power and investigate the sensitivity of these properties to key network parameters. We introduce a scheme for sparse VC allocation that limits transitions between groups of VCs based on the function they perform, and reduces the VC allocator's delay, area and power by up to 41\%, 90\% and 83\%, respectively. Furthermore, we propose a pessimistic mechanism for speculative switch allocation that reduces switch allocator delay by up to 23\% compared to a conventional implementation without increasing the router's zero-load latency. Finally, we quantify the effects of the various design choices discussed in the paper on overall network performance by presenting simulation results for two exemplary 64-node NoC topologies.}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2009}, doi = {10.1145/1654059.1654112}, url = {http://nocs.stanford.edu/files/publications/dub_sc09.pdf} }
@phdthesis{Peh:2001p319, author = {Li-Shiuan Peh}, title = {Flow Control and Micro-Architectural Mechanisms for Extending the Performance of Interconnection Networks}, school = {Stanford University}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2001}, url = {http://cva.stanford.edu/publications/2001/lspeh_thesis.pdf} }
@phdthesis{Towles:2005p317, author = {Brian Towles}, title = {Distributed Router Fabrics}, school = {Stanford University}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2005}, url = {http://cva.stanford.edu/publications/2005/btowles_thesis.pdf} }
@phdthesis{Singh:2005p318, author = {Arjun Singh}, title = {Load-Balanced Routing in Interconnection Networks}, school = {Stanford University}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2005}, url = {http://cva.stanford.edu/publications/2005/thesis_arjuns.pdf} }
@phdthesis{Kim:2008p342, author = {John Kim}, title = {High-Radix Interconnection Networks}, school = {Stanford University}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2008}, url = {http://cva.stanford.edu/publications/2008/jkim_thesis.pdf} }
@techreport{Towles:2001p316, author = {Brian Towles}, journal = {CVA Technical Reports}, title = {Finding Worst-case Permutations for Oblivious Routing Algorithms}, abstract = {We present an algorithm to find a worst-case traffic pattern for any oblivious routing algorithm on an arbitrary interconnection network topology. The linearity of channel loading offered by oblivious routing algorithms enables the problem to be mapped to a bipartite maximum-weight matching, which can be solved in polynomial time. Finding exact worst-case performance was previously intractable, and we demonstrate an example case where traditional characterization techniques overestimate the throughput of a particular routing algorithm by 47\%. }, institution = {Concurrent VLSI Architectures Group, Stanford University}, affiliation = {Computer Systems Laboratory, Stanford University}, number = {121}, year = {2001}, url = {http://cva.stanford.edu/publications/2001/wc_oblivious_TR121.pdf} }
@techreport{Singh:2004p314, author = {Arjun Singh and William J Dally}, journal = {CVA Technical Reports}, title = {Delay and Buffer Bounds in High Radix Interconnection Networks}, abstract = {We apply recent results in queueing theory to propose a methodology for bounding the buffer depth and packet delay in high radix interconnection networks. While most work in interconnection networks has been focused on the throughput and average latency in such systems, few studies have been done providing statistical guarantees for buffer depth, packet reordering and packet delays. These parameters are key in the design and performance of a network. We present a methodology for calculating such bounds for a practical high radix network and through extensive simulations show its effectiveness for both bursty and non-bursty injection traffic. Our results suggest that modest speedups and buffer depths enable reliable networks without flow control to be constructed. }, institution = {Concurrent VLSI Architectures Group, Stanford University}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2004}, url = {http://cva.stanford.edu/publications/2004/arjun_bounds.pdf} }
@techreport{Singh:2004p315, author = {Arjun Singh and William J Dally}, journal = {CVA Technical Reports}, title = {Globally Adaptive Load-Balanced Routing on k-ary n-cubes}, institution = {Concurrent VLSI Architectures Group, Stanford University}, affiliation = {Computer Systems Laboratory, Stanford University}, year = {2004} }
@techreport{Michelogiannakis:2008p313, author = {George Michelogiannakis and James Balfour and William J Dally}, journal = {CVA Technical Reports}, title = {Elastic Buffer Networks-on-Chip}, abstract = {This paper presents elastic buffers (EBs), an efficient flow-control scheme that uses the storage already present in pipelined channels in place of explicit input virtual-channel buffers (VCBs). With this approach, the channels themselves act as distributed FIFO buffers under congestion. Without VCBs, and hence virtual channels (VCs), deadlock prevention is achieved by duplicating physical channels. We develop a channel occupancy detector to apply universal globally adaptive load-balancing (UGAL) routing to load balance traffic in networks using EBs. Using EBs results in up to 12\% (17\% for low-swing channels) improvement in peak throughput per unit power compared to a network using VC flow control. The power saved by eliminating VCBs is used to make channels wider, providing increased throughput that more than offsets the degradation caused by increased blocking without VCs. EB networks have identical zero-load latency to VC networks and have considerably simpler router logic as a VC allocator is not required.}, institution = {Concurrent VLSI Architectures Group, Stanford University}, affiliation = {Computer Systems Laboratory, Stanford University}, number = {124}, year = {2008}, url = {http://cva.stanford.edu/publications/2008/EBNoCs.pdf} }
@techreport{Michelogiannakis:2009p392, author = {George Michelogiannakis and William J Dally}, journal = {CVA Technical Reports}, title = {Router Designs for Elastic Buffer On-Chip Networks}, abstract = {This paper explores the design space of elastic buffer (EB) routers by evaluating three representative designs. We propose an enhanced two-stage EB router which maximizes throughput by achieving a 42\% reduction in cycle time and 20\% reduction in occupied area by using look-ahead routing and replacing the three-slot output EBs in the baseline router of [12] with two-slot EBs. We also propose a single-stage router which merges the two pipeline stages to avoid pipelining overhead. Compared to it, the enhanced two-stage router has a 32\% increase in zero-load latency when operated at the same clock frequency; moreover, the single-stage router reduces the required energy per transferred bit and occupied area by 29\% and 30\% respectively, compared to the enhanced two-stage router. However, the cycle time of the enhanced two-stage router is 26\% smaller than that of the single-stage router.}, institution = {Concurrent VLSI Architectures Group, Stanford University}, affiliation = {Computer Systems Laboratory, Stanford University}, number = {125}, year = {2009}, url = {http://cva.stanford.edu/publications/2009/EBRouters.pdf} }