Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
tile_traits_standard.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/tile_iterator.h"
32 
33 namespace cutlass {
34 
36 
38 template <typename ThreadShape>
42  Coord<4> operator()() const {
43  Coord<4> thread_offset;
44 
45  int index = threadIdx.x;
46 
47  thread_offset[3] = (index % ThreadShape::kC);
48  index = (index / ThreadShape::kC);
49 
50  thread_offset[2] = (index % ThreadShape::kW);
51  index = (index / ThreadShape::kW);
52 
53  thread_offset[1] = (index % ThreadShape::kH);
54  index = (index / ThreadShape::kH);
55 
56  thread_offset[0] = index;
57 
58  return thread_offset;
59  }
60 };
61 
63 
66 template <typename Tile_, int Threads>
69  typedef Tile_ Tile;
70 
72  static int const kThreads = Threads;
73 
74  // Static assertions
76  "Tiling undefined if elements not divisible by threads.");
77 
78  static_assert(Tile::kW <= kThreads,
79  "This specialization assumes there are more threads than the contiguous dimension "
80  "of the tile.");
81 
83  typedef Shape<1, kThreads / Tile::kW, Tile::kW, 1> ThreadShape;
84 
87 
89  typedef Shape<1, Tile::kH / ThreadShape::kH, 1, 1> Iterations;
90 
93 };
94 
96 
99 template <typename Tile_, int Threads>
102  typedef Tile_ Tile;
103 
105  static int const kThreads = Threads;
106 
107  // Static assertions
108  static_assert(Tile::kW >= kThreads,
109  "This specialization assumes there are more threads than the contiguous dimension "
110  "of the tile.");
111 
113  "Tiling undefined if elements not divisible by threads.");
114 
115  static_assert(!(Tile::kW % kThreads),
116  "The contiguous size of the tile must be divisible by the number of threads.");
117 
120 
123 
125  typedef Shape<1, Tile::kH, Tile::kW / kThreads> Iterations;
126 
129 };
130 
132 
134 template <typename Tile_, int Threads>
137  typedef Tile_ Tile;
138 
140  static int const kThreads = Threads;
141 
143  static int const kWarpSize = 32;
144 
146  static int const kWarpCount = kThreads / kWarpSize;
147 
148  // Static assertions
150  "Tiling undefined if elements not divisible by threads.");
151 
152  static_assert(!(kThreads % kWarpSize), "Number of threads must be divisible by the warp size.");
153 
154  static_assert(!(Tile::kW % kWarpSize), "Contiguous dimension must be divisible by the warp size");
155 
157  static int const kWarpsStrided = __NV_STD_MIN(kWarpCount, Tile::kH);
158 
161 
164 
167 
169  typedef Shape<1, Tile::kH / Delta::kH, Tile::kW / ThreadShape::kW> Iterations;
170 
172  struct ThreadOffset {
176  int tid = threadIdx.x;
177  int warp = (tid / kWarpSize);
178  int lane = (tid % kWarpSize);
179 
180  static int const kWarpSpanContiguous = kWarpSize * Iterations::kW;
181 
182  int warp_w = (warp % kWarpsContiguous);
183  int warp_h = (warp / kWarpsContiguous);
184 
185  return make_Coord(0, warp_h, lane + kWarpSpanContiguous * warp_w, 0);
186  }
187  };
188 };
189 
191 
193 template <typename Tile_, int Threads>
196  typedef Tile_ Tile;
197 
199  static int const kThreads = Threads;
200 
202  static int const kWarpSize = 32;
203 
205  static int const kWarpCount = kThreads / kWarpSize;
206 
208  static int const kAccessSize = 1;
209 
210  // Static assertions
212  "Tiling undefined if elements not divisible by threads.");
213 
217  typedef typename platform::conditional <
218  Tile::kW<kWarpSize,
220  typename platform::conditional<!(Tile::kW % kWarpSize),
223  type Traits;
224 
226  typedef typename Traits::Delta Delta;
227 
229  typedef Shape<0, 0, 0, 0> ImmediateOffsetStrides;
230 
232  typedef typename Traits::Iterations Iterations;
233 
235  typedef typename Traits::ThreadOffset ThreadOffset;
236 };
237 
239 
240 } // namespace cutlass
Shape< 1, Tile::kH/Delta::kH, Tile::kW/ThreadShape::kW > Iterations
Number of iterations.
Definition: tile_traits_standard.h:169
Definition: convert.h:33
Shape< 1, Tile::kH/ThreadShape::kH, 1, 1 > Iterations
Number of iterations.
Definition: tile_traits_standard.h:89
static int const kWarpCount
Number of participating warps.
Definition: tile_traits_standard.h:205
Definition: tile_traits_standard.h:100
Defines the Tile Traits concept and iterators for loading and storing to tiles efficiently.
static int const kWarpsStrided
Warps strip-mined across strided dimension.
Definition: tile_traits_standard.h:157
static int const kThreads
Number of participating threads.
Definition: tile_traits_standard.h:105
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate.
Definition: coord.h:368
Computes the thread offset in (H, W) based on thread ID.
Definition: tile_traits_standard.h:172
static int const kThreads
Number of participating threads.
Definition: tile_traits_standard.h:72
Chooses &#39;best&#39; shape to enable warp raking along contiguous dimension if possible.
Definition: tile_traits_standard.h:194
Tile_ Tile
Shape of tile.
Definition: tile_traits_standard.h:137
static int const kWarpsContiguous
Warps stripmined contiguous dimension.
Definition: tile_traits_standard.h:160
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Computes the logical coordinate from thread shape.
Definition: tile_traits_standard.h:42
Shape< 1, kWarpsStrided, kWarpSize > Delta
The same warp rakes along the contiguous dimension.
Definition: tile_traits_standard.h:166
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Basic thread offset function computed from a thread shape.
Definition: tile_traits_standard.h:175
Basic thread offset function computed from a thread shape.
Definition: tile_traits_standard.h:39
static int const kH
The height of the cube.
Definition: shape.h:68
static int const kThreads
Number of participating threads.
Definition: tile_traits_standard.h:140
Shape< 1, ThreadShape::kH, 1, 1 > Delta
Delta along each dimension.
Definition: tile_traits_standard.h:86
Shape< 1, kThreads/Tile::kW, Tile::kW, 1 > ThreadShape
Shape of threads.
Definition: tile_traits_standard.h:76
static int const kWarpSize
Hard-coded warp size.
Definition: tile_traits_standard.h:143
#define __NV_STD_MIN(a, b)
Select minimum(a, b)
Definition: platform.h:168
static int const kAccessSize
By default, do not do scalar loads.
Definition: tile_traits_standard.h:208
Tile_ Tile
Shape of tile.
Definition: tile_traits_standard.h:196
Tile_ Tile
Shape of tile.
Definition: tile_traits_standard.h:69
static int const kWarpCount
Number of participating warps.
Definition: tile_traits_standard.h:146
Shape< 1, kWarpsStrided, kWarpsContiguous *kWarpSize > ThreadShape
Arrangement of threads.
Definition: tile_traits_standard.h:163
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
Definition: tile_traits_standard.h:67
std::conditional (true specialization)
Definition: platform.h:351
#define static_assert(__e, __m)
Definition: platform.h:153
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
TiledThreadOffset< ThreadShape > ThreadOffset
Computes the initial offset.
Definition: tile_traits_standard.h:92
Tile_ Tile
Shape of tile.
Definition: tile_traits_standard.h:102
static int const kW
The width of the cube.
Definition: shape.h:70
Tiling in which warps rake across the contiguous dimension.
Definition: tile_traits_standard.h:135
static int const kWarpSize
Hard-coded warp size.
Definition: tile_traits_standard.h:202
Shape< 1, 1, kThreads > Delta
Delta between each thread&#39;s access.
Definition: tile_traits_standard.h:122
Shape< 1, 1, kThreads > ThreadShape
Thread shape.
Definition: tile_traits_standard.h:110
Compute derived counted of a Layout Concept based class.
Definition: shape.h:79
TiledThreadOffset< ThreadShape > ThreadOffset
Computes the initial offset.
Definition: tile_traits_standard.h:128
static int const kThreads
Number of participating threads.
Definition: tile_traits_standard.h:199
Shape< 1, Tile::kH, Tile::kW/kThreads > Iterations
Number of iterations.
Definition: tile_traits_standard.h:125