Differences

This shows you the differences between two versions of the page.

publications:bibtex [2018/11/14 11:31] sbs
publications:bibtex [2024/01/06 18:41] (current) hnv
Line 49: Line 49:
 %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +% 2023
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +
 +@inproceedings{ShinKoopSchoFHPNC23,
 + author = {\v{S}inkarovs, Artjoms and Koopman, Thomas and Scholz, Sven-Bodo},
 + title = {Rank-Polymorphism for Shape-Guided Blocking},
 + year = {2023},
 + isbn = {9798400702969},
 + publisher = {Association for Computing Machinery},
 + address = {New York, NY, USA},
 + artefact = {https://dl.acm.org/do/10.1145/3580402/},
 + doi = {10.1145/3609024.3609410},
 + abstract = {Many numerical algorithms on matrices or tensors can be formulated in a blocking style which improves performance due to better cache locality. In imperative languages, blocking is achieved by introducing additional layers of loops in a nested fashion alongside with suitable adjustments in index computations. While this process is tedious and error-prone, it is also difficult to implement a generically blocked version that would support arbitrary levels of blocking. At the example of matrix multiply, this paper demonstrates how rank-polymorphic array languages enable the specification of such generically blocked algorithms in a simple, recursive form. The depth of the blocking as well as blocking factors can be encoded in the structure of array shapes. In turn, reshaping arrays makes it possible to switch between blocked and non-blocked arrays. Through rank-polymorphic array combinators, any specification of loop boundaries or explicit index computations can be avoided. Firstly, we propose a dependently-typed framework for rank-polymorphic arrays. We use it to demonstrate that all blocked algorithms can be naturally derived by induction on the argument shapes. Our framework guarantees lack of out-of-bound indexing, and we also prove that all the blocked versions compute the same results as the canonical algorithm. Secondly, we translate our specification to the array language SaC. Not only do we show that we achieve similar conciseness in the implementation, but we also observe good performance of the generated code. We achieve a 7\% improvement compared to the highly-optimised OpenBLAS library, and 3\% compared to Intel’s MKL library when running on a 32-core shared-memory system.},
 + booktitle = {Proceedings of the 11th ACM SIGPLAN International Workshop on Functional High-Performance and Numerical Computing},
 + pages = {1--14},
 + numpages = {14},
 + keywords = {Matrix Multiply, Dependent Types, Array Programming, HPC, Rank-Polymorphism},
 + location = {Seattle, WA, USA},
 + category = {par,apps},
 + series = {FHPNC 2023},
 +}
 + 
 + 
 +
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +% 2022
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +
 +@inproceedings{BuerSchoIFL22, 
 +author = {van Beurden, Patrick and Scholz, Sven-Bodo}, 
 +title = {On Generating Out-Of-Core GPU Code for Multi-Dimensional Array Operations}, 
 +year = {2023},
 +isbn = {978-1-4503-9831-2/22/08}, 
 +publisher = {Association for Computing Machinery}, 
 +address = {New York, NY, USA}, 
 +url = {beurdenscholzifl22.pdf}, 
 +doi = {10.1145/3587216.3587223}, 
 +abstract = {This paper presents the first results of our experiments for generating CUDA code that streams array operations over the elements of its array arguments from high-level specifications. We look at two classes of memory-bound array operations: map-like operations and iterative stencil computations. We investigate code patterns that stream the arguments of these operations from the host through the GPU and back, taking the iterative nature of our experiments into account. We show that this setup does not only enable computations on arrays that are so big that they do not fit into the device memory of a single GPU (hence “out-of-core”), but we also demonstrate that the proposed streaming code outperforms non-streaming code versions even for smaller array sizes. For both application patterns, we observe memory throughputs that are beyond 80\% of the hardware capability, irrespective of the problem sizes.},
 +summary = {This paper shows the power of streaming for GPUs.},
 +category = {par},
 +booktitle = {Proceedings of the 34th Symposium on Implementation and Application of Functional Languages}, 
 +pages = {??–??}, 
 +numpages = {12}, 
 +keywords = {code generation, GPUs, CUDA, streaming, memory management, out-of-core computation}, 
 +location = {Copenhagen, Denmark}, 
 +series = {IFL 2022} 
 +}
 +
 +@inproceedings{SinkSchoARRAY22,
 + author = {\v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
 + title = {Parallel Scan as a Multidimensional Array Problem},
 + year = {2022},
 + isbn = {9781450392693},
 + publisher = {Association for Computing Machinery},
 + address = {New York, NY, USA},
 + url = {SinkarovsScholzARRAY22.pdf},
 + doi = {10.1145/3520306.3534500},
 + summary = {The true power of rank-polymorphism at the example of scan.},
 + category = {apps, par},
 + abstract = {For many algorithms, it is challenging to identify a suitable parallel version, as the design space is typically very large. In this paper we demonstrate how rank-polymorphic array languages can be used as a tool to explore such design spaces through concise high-level specifications. If input data can be organised into a multi-dimensional array, and the algorithm can be stated as a recursive traversal over sub-arrays, array languages offer a lot of expressive power. The reason for this is that array shapes can be used to guide recursive traversals. Conciseness of specifications comes from array reshapes that move the desired elements into canonical hyperplanes. As a case study, we discuss several variants of implementing prefix sums (also known as scans) in SaC. We demonstrate how small code adjustments suffice to change the concurrency pattern exposed to the compiler. It turns out that variability that is typically achieved by generic inductive data types such as binary trees is a special case of what is offered by the array paradigm.},
 + booktitle = {Proceedings of the 8th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming},
 + pages = {1--11},
 + numpages = {11},
 + keywords = {rank polymorphism, array languages, algorithms, parallelism, functional programming, prefix sum},
 + location = {San Diego, CA, USA},
 + series = {ARRAY 2022} 
 +}
 +
 +
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +% 2021
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +
 +@inproceedings{SinkViesSchoARRAY21,
 + author = {\v{S}inkarovs, Artjoms and Vie\ss{}mann, Hans-Nikolai and Scholz, Sven-Bodo},
 + title = {Array Languages Make Neural Networks Fast},
 + year = {2021},
 + isbn = {9781450384667},
 + publisher = {Association for Computing Machinery},
 + address = {New York, NY, USA},
 + url = {cnn-in-sac-ARRAY21.pdf},
 + doi = {10.1145/3460944.3464312},
 + summary = {Demonstrates how SaC combines high productivity with high performance: less than 100 lines for a CNN from scratch with competitive performance.},
 + abstract = {Most implementations of machine learning algorithms are based on special-purpose frameworks such as TensorFlow or PyTorch. While these frameworks are convenient to use, they introduce multi-million lines of code dependency that one has to trust, understand and potentially modify. As an alternative, this paper investigates a direct implementation of a state of the art Convolutional Neural Network (CNN) in an array language. While our implementation requires 150 lines of code to define the special-purpose operators needed for CNNs, which are readily provided through frameworks such as TensorFlow and PyTorch, our implementation outperforms these frameworks by factors 2 and 3 on a fixed set of hardware — a 64-core GPU-accelerated machine; for a simple example network. The resulting specification is written in a rank-polymorphic data-parallel style, and it can be immediately leveraged by optimising compilers. Indeed, array languages make neural networks fast.},
 + booktitle = {Proceedings of the 7th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming},
 + numpages = {12},
 + keywords = {machine learning, array languages},
 + category = {apps},
 + location = {Virtual, Canada},
 + series = {ARRAY 2021}
 +}
 +
 +@inproceedings{CuycSchoIFL21,
 + author = {van Cuyck, Gijs and Scholz, Sven-Bodo},
 + title = {In-Place-Folding of Non-Scalar Hyper-Planes of Multi-Dimensional Arrays},
 + year = {2022},
 + isbn = {9781450386449},
 + publisher = {Association for Computing Machinery},
 + address = {New York, NY, USA},
 + url = {CuyckScholzIFL21.pdf},
 + doi = {10.1145/3544885.3544893},
 + summary = {Doing non-scalar folds in place.},
 + category = {core, opt},
 + abstract = {Memory management plays a key role when trying to compile functional programs into efficiently executable code. In particular when using flat representations for multi-dimensional arrays, i.e., when using a single memory block for the entire data of a multi-dimensional array, in-place updates become crucial for highly competitive performance. This paper proposes a novel code generation technique for performing fold-operations on hyper-planes of multi-dimensional arrays, where the fold-operation itself operates on non-scalar sub-arrays, i.e., on vectors or higher-dimensional arrays. This technique allows for a single result array allocation over the entire folding operation without requiring the folding operation itself to be scalarised. It enables the utilisation of vector operations without any added memory allocation or copying overhead. We describe our technique in the context of SaC, sketch our implementation in the context of the compiler sac2c and provide some initial performance measurements that give an indication of the effectiveness of this new technique.},
 + booktitle = {33rd Symposium on Implementation and Application of Functional Languages},
 + pages = {29--41},
 + numpages = {13},
 + keywords = {reference counting, compiler optimisation, memory management, SaC},
 + location = {Nijmegen, Netherlands},
 + series = {IFL '21} 
 +}
 +
 +@inproceedings{JansSchoIFL21,
 + author = {Janssen, Niek and Scholz, Sven-Bodo},
 + title = {On Mapping N-Dimensional Data-Parallelism Efficiently into GPU-Thread-Spaces},
 + year = {2022},
 + isbn = {9781450386449},
 + publisher = {Association for Computing Machinery},
 + address = {New York, NY, USA},
 + url = {https://doi.org/10.1145/3544885.3544894},
 + doi = {10.1145/3544885.3544894},
 + category = {par},
 + summary = {Completely revamped kernel executions based on index space transformations.},
 + abstract = {Data-Parallelism on multi-dimensional arrays can be conveniently specified as element-wise computations that are executed across n-dimensional index-spaces. While this at first glance matches very well the concept of n-dimensional thread-spaces as provided by the programming model of GPUs, in practice, a one-to-one correspondence often does not work out. This paper proposes a small set of combinators for mapping multi-dimensional index spaces onto multi-dimensional thread-index spaces suitable for execution on GPUs. For each combinator, we provide an inverse operation, which allows the original indices to be recovered within the individual threads. This setup facilitates arbitrary n-dimensional array computations to be executed on GPUs with arbitrary thread-space constraints provided the overall resources required for the computation do not exceed those of the GPU.},
 + booktitle = {33rd Symposium on Implementation and Application of Functional Languages},
 + pages = {54--66},
 + numpages = {13},
 + keywords = {SaC, array transformations, functional languages, GPU, CUDA},
 + location = {Nijmegen, Netherlands},
 + series = {IFL '21} 
 +}
 +
 +
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +% 2020
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +
 +@inproceedings{ViesSchoIFL20, 
 +author = {Vie\ss{}mann, Hans-Nikolai and Scholz, Sven-Bodo}, 
 +title = {Effective Host-GPU Memory Management Through Code Generation}, 
 +year = {2021},
 +isbn = {9781450389631}, 
 +publisher = {Association for Computing Machinery}, 
 +address = {New York, NY, USA}, 
 +url = {https://doi.org/10.1145/3462172.3462199}, 
 +doi = {10.1145/3462172.3462199}, 
 +abstract = {NVIDIA’s CUDA provides several options to orchestrate the management of host and device memory as well as the communication between them. In this paper we look at these choices, identify the program changes required when switching between them, and we observe their effect on application performance. We present code generation schemes that translate resource-agnostic program specifications, i.e., programs without any explicit notion of memory or GPU kernels, into five CUDA versions that differ in the use of the memory and communication API of CUDA only. An implementation of these code generators within the compiler of the functional programming language Single-Assignment C (SaC) shows performance differences between the variants by up to a factor of 3. Performance analyses reveal that the preferred choices depend on a combination of several factors, including the actual hardware being used, and several aspects of the application itself. A clear choice, therefore, cannot be made a priori. Instead, it seems essential that different variants can be generated from a single source for achieving performance portability across GPU devices.}, 
 +booktitle = {Proceedings of the 32nd Symposium on Implementation and Application of Functional Languages}, 
 +pages = {138--149}, 
 +numpages = {12}, 
 +keywords = {GPU, communication models, SaC, memory management, code generation, CUDA, transfer bandwidth}, 
 +location = {Canterbury, United Kingdom}, 
 +series = {IFL 2020} 
 +}
 +
 +
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +% 2019
 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 +
 +@inproceedings{SchoShinIFL19,
 + author = {Scholz, Sven-Bodo and \v{S}inkarovs, Artjoms},
 + title = {Tensor Comprehensions in {SaC}},
 + year = {2019},
 + isbn = {9781450375627},
 + publisher = {Association for Computing Machinery},
 + address = {New York, NY, USA},
 + booktitle = {Proceedings of the 31st Symposium on the Implementation and Application of Functional Programming Languages},
 + series = {IFL '19},
 + articleno = {15},
 + numpages = {13},
 + doi = {10.1145/3412932.3412947},
 + url = {tensor-comprehensions-IFL19.pdf},
 + summary = {Motivation, syntax, and semantics of Tensor Comprehensions in SaC.},
 + abstract = {We propose a new notation for data parallel operators on multi-dimensional arrays named tensor comprehensions. This notation combines the basic principle of array-comprehensions with syntactical shortcuts very close to those found in the so-called Tensor Notations used in Physics and Mathematics. As a result, complex operators with rich semantics can be defined concisely. The key to this conciseness lies in the ability to define shape-polymorphic operations combined with the ability to infer array shapes from the immediate context. The paper provides a definition of the proposed notation, a formal shape inference process, as well as a set of re-write rules that translates tensor comprehensions as a zero-cost syntactic sugar into standard SaC expressions.},
 + keywords = {Array comprehensions, Call by value, Array programming, Functional languages},
 + category  = {core,design},
 + location = {Singapore}
 +}
  
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Line 54: Line 233:
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  
 +@inproceedings{ViesSinkSchoIFL18,
 +author = {Vie\ss{}mann, Hans-Nikolai and \v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
 +title = {Extended Memory Reuse: An Optimisation for Reducing Memory Allocations},
 +year = {2018},
 +isbn = {9781450371438},
 +publisher = {Association for Computing Machinery},
 +address = {New York, NY, USA},
 +doi = {10.1145/3310232.3310242},
 +summary = {Explains all details of the optimisation EMR.},
 +url = {extended-memory-reuse-IFL18.pdf},
 +abstract = {In this paper we present an optimisation for reference counting based garbage collection. The optimisation aims at reducing the total number of calls to the heap manager while preserving the key benefits of reference counting, i.e. the opportunities for in-place updates as well as memory deallocation without global garbage collection. The key idea is to carefully extend the lifetime of variables so that memory deallocations followed by memory allocations of the same size can be replaced by a direct memory reuse. Such memory reuse turns out particularly useful in the context of innermost loops of compute-intensive applications. It leads to a runtime behaviour that performs pointer swaps between buffers in the same way it would be implemented manually in languages that require explicit memory management, e.g. C. We have implemented the proposed optimisation in the context of the Single-Assignment C compiler tool chain. The paper provides an algorithmic description of our optimisation and an evaluation of its effectiveness over a collection of benchmarks including a subset of the Rodinia benchmarks and the NAS Parallel Benchmarks. We show that for several benchmarks with allocations within loops our optimisation reduces the amount of allocations by a few orders of magnitude. We also observe no negative impact on the overall memory footprint nor on the overall runtime. Instead, for some sequential executions we find mild improvement, and on GPU devices we observe speedups of up to a factor of 4x.},
 +booktitle = {Proceedings of the 30th Symposium on Implementation and Application of Functional Languages},
 +pages = {107--118},
 +numpages = {12},
 +category = {core,opt},
 +keywords = {reference counting, memory management, compiler optimisation},
 +location = {Lowell, MA, USA},
 +series = {IFL 2018}
 +}
  
 @inproceedings{SinkBernViesSchoARRAY18,
Line 708: Line 906:
 % 2007
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
- 
  
 @Article{TrojGrelNWPT07,
Line 724: Line 921:
   CONTENTS  = {},
   sourceURL = {},
-  TOPICS    = {}
 +  TOPICS    = {},
   summary    = {},
   abstract   = { The array programming paradigm adopts multidimensional arrays as the fundamental data structures of computation. Array operations process entire arrays instead of just single elements. This makes array programs highly expressive and introduces data parallelism in a natural way. Array programming imposes non-trivial structural constraints on ranks, shapes, and element values of arrays. A prominent example of such violations are out-of-bound array accesses. Usually, such constraints are enforced by means of run time checks. Both the run time overhead inflicted by dynamic constraint checking and the uncertainty of proper program evaluation are undesirable. In this paper, we propose a novel type system for array programs based on dependent types. Our type system makes dynamic constraint checks obsolete and guarantees orderly evaluation of well-typed programs. We employ integer vectors of statically unknown length to index array types. We also show how constraints on these vectors are resolved using a suitable reduction to integer scalars. Our presentation is based on a functional array calculus that captures the essence of the paradigm without the legacy and obfuscation of a fully-fledged array programming language.},
   category   = {types},
   doi        = {10.1016/j.jlap.2009.03.002},
-  url        = {dtapdgw.pdf}
 +  url        = {dtapdgw.pdf},
 }
  
Line 807: Line 1004:
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  
-@InProceedings{BerneckyIFL06,
-  author    = {Robert Bernecky},
-  title     = {Shape Cliques},
-  booktitle = {18th International Symposium on Implementation and Application of Functional Languages (IFL'06), Budapest, Hungary},
-  year      = {2006},
-  editor    = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
-  series    = {Technical Report 2006-S01},
-  pages     = {1--12},
-  publisher = {E\"otv\"os Lor\'and University, Faculty of Informatics, Budapest, Hungary},
-  topics    = {SAC,APEX,APL,Shape Cliques,Shape Inference},
-  summary    = {},
-  abstract   = {We introduce shape cliques, a simple way to organize a subset of the arrays appearing in an array-language-based application into sets of identically shaped arrays- shape cliques- and show how a compiler can analyze an application to infer membership in those cliques. We describe an algorithm for performing shape clique inference (SCI), and demonstrate that shape cliques can improve the performance of generated code, by permitting extension of an optimization for removal of run-time checks, and by extending the set of arrays to which optimizations, such as Index Vector Elimination (IVE), can be applied. Implementation of SCI in the APEX APL compiler permitted removal of 25 % of run-time checks remaining on 156 benchmarks remaining after other compiler optimizations had eliminated 72 % of the 1251 checks present in the original code. In the SAC compiler, IVE using SCI produced typical speedups of 2{14X on benchmarks operating on arrays of non- xed rank and shape, compared to the operation of IVE in a non-SCI environment. Shape clique inference data can be exploited to allow certain other optimizations, such as loop fusion and with-loop folding, to be performed on arrays of statically unknown shape and rank, with the potential for signi cant reductions in execution time.},
 +@inproceedings{BerneckyIFL06,
 +  title        = {Shape Cliques},
 +  author       = {Robert Bernecky},
 +  year         = {2006},
 +  booktitle    = {18th International Symposium on Implementation and Application of Functional Languages (IFL'06), Budapest, Hungary},
 +  publisher    = {E\"otv\"os Lor\'and University, Faculty of Informatics, Budapest, Hungary},
 +  series       = {Technical Report 2006-S01},
 +  pages        = {1--12},
 +  editor       = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
 +  topics       = {SAC,APEX,APL,Shape Cliques,Shape Inference},
 +  abstract     = {We introduce shape cliques, a simple way to organize a subset of the arrays appearing in an array-language-based application into sets of identically shaped arrays -- shape cliques -- and show how a compiler can analyze an application to infer membership in those cliques. We describe an algorithm for performing shape clique inference (SCI), and demonstrate that shape cliques can improve the performance of generated code, by permitting extension of an optimization for removal of run-time checks, and by extending the set of arrays to which optimizations, such as Index Vector Elimination (IVE), can be applied. Implementation of SCI in the APEX APL compiler permitted removal of 25 \% of run-time checks remaining on 156 benchmarks remaining after other compiler optimizations had eliminated 72 \% of the 1251 checks present in the original code. In the SAC compiler, IVE using SCI produced typical speedups of 2--14X on benchmarks operating on arrays of non-fixed rank and shape, compared to the operation of IVE in a non-SCI environment. Shape clique inference data can be exploited to allow certain other optimizations, such as loop fusion and with-loop folding, to be performed on arrays of statically unknown shape and rank, with the potential for significant reductions in execution time.},
   category   = {opt},
   doi        = {},
-  url        = {sc.pdf}
 +  url        = {sc.pdf},
 }
- 
  
 @InProceedings{HerhSchoTFP06,
Line 839: Line 1034:
   sourceURL = {},
   TOPICS    = {SAC},
-  AFFIL     = {ctca}
 +  AFFIL     = {ctca},
   summary    = {},
   abstract   = {In this paper we propose a new means to model and operate on nested arrays that allows for a high level of abstraction without introducing a performance penalty. We achieve this by using a nesting structure on array types which allows us to shift the nesting information of arrays from the runtime representation level to the type system level. This information can then be exploited for generic function definitions on the nesting structure of arrays which, as we show, neatly integrates with subtyping based function overloading. Finally, we demonstrate for an example how nested arrays and generic function definitions can be fully stripped out using existing optimisation techniques},
Line 922: Line 1117:
   category   = {opt},
   doi        = {10.1007/11964681_13},
-  url        = {absafgpoa.pdf}
 +  url        = {absafgpoa.pdf},
 }
  
Line 1145: Line 1340:
   category   = {core, design},
   doi        = {10.1.1.138.6995},
-  url        = {SACESFHLAOIAFS.pdf}
 +  url        = {SACESFHLAOIAFS.pdf},
 }
  
Line 1226: Line 1421:
   category   = {core, opt},
   doi        = {10.1.1.138.8533},
-  url        = {ACSFAHOAT.pdf}
 +  url        = {ACSFAHOAT.pdf},
 }
  
Line 1247: Line 1442:
   category   = {},
   doi        = {},
-  url        = {}
 +  url        = {},
 } }