Open3D (C++ API) 0.17.0
StdGPUHashBackend.h
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#pragma once

#include <stdgpu/memory.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <stdgpu/unordered_map.cuh>
#include <type_traits>

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/StdAllocator.h"
#include "open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h"
#include "open3d/core/hashmap/DeviceHashBackend.h"
#include "open3d/core/hashmap/Dispatch.h"

namespace open3d {
namespace core {

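// Allocator bridging Open3D's StdAllocator and stdgpu: device memory is
// obtained through StdAllocator for the given device, and every block is
// registered/deregistered with stdgpu so that stdgpu's internal memory
// bookkeeping stays consistent.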
template <typename T>
class StdGPUAllocator {
public:
    /// T.
    using value_type = T;

    /// Default constructor.
    StdGPUAllocator() = default;

    /// Constructor from device.
    explicit StdGPUAllocator(int device_id) : std_allocator_(device_id) {}

    /// Default copy constructor.
    StdGPUAllocator(const StdGPUAllocator&) = default;

    /// Default copy assignment operator.
    StdGPUAllocator& operator=(const StdGPUAllocator&) = default;

    /// Default move constructor.
    StdGPUAllocator(StdGPUAllocator&&) = default;

    /// Default move assignment operator.
    StdGPUAllocator& operator=(StdGPUAllocator&&) = default;

    /// Rebind copy constructor.
    template <typename U>
    StdGPUAllocator(const StdGPUAllocator<U>& other)
        : std_allocator_(other.std_allocator_) {}

    /// Allocates memory of size n.
    T* allocate(std::size_t n) {
        T* p = std_allocator_.allocate(n);
        stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
        return p;
    }

    /// Deallocates memory from pointer p of size n.
    void deallocate(T* p, std::size_t n) {
        stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
        std_allocator_.deallocate(p, n);
    }

    /// Returns true if the instances are equal, false otherwise.
    bool operator==(const StdGPUAllocator& other) {
        return std_allocator_ == other.std_allocator_;
    }

    /// Returns true if the instances are not equal, false otherwise.
    bool operator!=(const StdGPUAllocator& other) { return !operator==(other); }

private:
    // Allow access in rebind constructor.
    template <typename T2>
    friend class StdGPUAllocator;

    StdAllocator<T> std_allocator_;
};

// These typedefs must be defined outside of StdGPUHashBackend to make them
// accessible in raw CUDA kernels.
template <typename Key>
using InternalStdGPUHashBackendAllocator =
        StdGPUAllocator<thrust::pair<const Key, buf_index_t>>;

template <typename Key, typename Hash, typename Eq>
using InternalStdGPUHashBackend =
        stdgpu::unordered_map<Key,
                              buf_index_t,
                              Hash,
                              Eq,
                              InternalStdGPUHashBackendAllocator<Key>>;

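// The stdgpu map stores only the key -> buf_index_t association; the key is
// additionally duplicated into the HashBackendBuffer, and all values live
// there as SoA blocks addressed by buf_index_t.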
template <typename Key, typename Hash, typename Eq>
class StdGPUHashBackend : public DeviceHashBackend {
public:
    StdGPUHashBackend(int64_t init_capacity,
                      int64_t key_dsize,
                      const std::vector<int64_t>& value_dsizes,
                      const Device& device);
    ~StdGPUHashBackend();

    void Reserve(int64_t capacity) override;

    /// Parallel insert contiguous arrays of keys and values.
    void Insert(const void* input_keys,
                const std::vector<const void*>& input_values_soa,
                buf_index_t* output_buf_indices,
                bool* output_masks,
                int64_t count) override;

    /// Parallel find a contiguous array of keys.
    void Find(const void* input_keys,
              buf_index_t* output_buf_indices,
              bool* output_masks,
              int64_t count) override;

    /// Parallel erase a contiguous array of keys.
    void Erase(const void* input_keys,
               bool* output_masks,
               int64_t count) override;

    /// Parallel collect all iterators in the hash table.
    int64_t GetActiveIndices(buf_index_t* output_indices) override;

    /// Clear stored map without reallocating memory.
    void Clear() override;

    /// Get the size (number of valid entries) of the hash map.
    int64_t Size() const override;

    /// Get the number of buckets of the hash map.
    int64_t GetBucketCount() const override;
    /// Get the number of entries per bucket.
    std::vector<int64_t> BucketSizes() const override;
    /// Get the current load factor, defined as size / bucket count.
    float LoadFactor() const override;

    InternalStdGPUHashBackend<Key, Hash, Eq> GetImpl() const { return impl_; }

    void Allocate(int64_t capacity);
    void Free();

protected:
    // Use reference, since the structure itself is implicitly handled as a
    // pointer directly by stdgpu.
    InternalStdGPUHashBackend<Key, Hash, Eq> impl_;

    CUDAHashBackendBufferAccessor buffer_accessor_;
};

template <typename Key, typename Hash, typename Eq>
StdGPUHashBackend<Key, Hash, Eq>::StdGPUHashBackend(
        int64_t init_capacity,
        int64_t key_dsize,
        const std::vector<int64_t>& value_dsizes,
        const Device& device)
    : DeviceHashBackend(init_capacity, key_dsize, value_dsizes, device) {
    CUDAScopedDevice scoped_device(this->device_);
    Allocate(init_capacity);
}

template <typename Key, typename Hash, typename Eq>
StdGPUHashBackend<Key, Hash, Eq>::~StdGPUHashBackend() {
    CUDAScopedDevice scoped_device(this->device_);
    Free();
}

template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::Size() const {
    CUDAScopedDevice scoped_device(this->device_);
    return impl_.size();
}

// Need an explicit kernel for non-const access to map
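// One thread per query key: writes whether the key is present and, if found,
// the buf_index_t of its entry (0 otherwise).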
template <typename Key, typename Hash, typename Eq>
__global__ void STDGPUFindKernel(InternalStdGPUHashBackend<Key, Hash, Eq> map,
                                 CUDAHashBackendBufferAccessor buffer_accessor,
                                 const Key* input_keys,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    auto iter = map.find(key);
    bool flag = (iter != map.end());
    output_masks[tid] = flag;
    output_buf_indices[tid] = flag ? iter->second : 0;
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
                                            buf_index_t* output_buf_indices,
                                            bool* output_masks,
                                            int64_t count) {
    CUDAScopedDevice scoped_device(this->device_);
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
            impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
            output_buf_indices, output_masks, count);
    cuda::Synchronize(this->device_);
}

// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash, typename Eq>
__global__ void STDGPUEraseKernel(InternalStdGPUHashBackend<Key, Hash, Eq> map,
                                  CUDAHashBackendBufferAccessor buffer_accessor,
                                  const Key* input_keys,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    auto iter = map.find(key);
    bool flag = (iter != map.end());
    output_masks[tid] = flag;
    output_buf_indices[tid] = flag ? iter->second : 0;

    if (output_masks[tid]) {
        output_masks[tid] = map.erase(key);
        if (output_masks[tid]) {
            buffer_accessor.DeviceFree(output_buf_indices[tid]);
        }
    }
}

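// Erase launches STDGPUEraseKernel, which must know the buf_index of each
// erased entry in order to release its buffer slot; a temporary Int32 tensor
// provides that scratch space on the device.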
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
                                             bool* output_masks,
                                             int64_t count) {
    CUDAScopedDevice scoped_device(this->device_);
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    core::Tensor toutput_buf_indices =
            core::Tensor({count}, core::Int32, this->device_);
    buf_index_t* output_buf_indices =
            static_cast<buf_index_t*>(toutput_buf_indices.GetDataPtr());

    STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
            impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
            output_buf_indices, output_masks, count);
    cuda::Synchronize(this->device_);
}

template <typename Key>
struct ValueExtractor {
    OPEN3D_HOST_DEVICE buf_index_t
    operator()(const thrust::pair<Key, buf_index_t>& x) const {
        return x.second;
    }
};

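// GetActiveIndices extracts the stored buf_index_t of every active entry by
// transforming the map's device_range() into output_indices.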
template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::GetActiveIndices(
        buf_index_t* output_indices) {
    CUDAScopedDevice scoped_device(this->device_);
    auto range = impl_.device_range();

    thrust::transform(range.begin(), range.end(), output_indices,
                      ValueExtractor<Key>());

    return impl_.size();
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Clear() {
    CUDAScopedDevice scoped_device(this->device_);
    impl_.clear();
    this->buffer_->ResetHeap();
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Reserve(int64_t capacity) {
    CUDAScopedDevice scoped_device(this->device_);
}

template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::GetBucketCount() const {
    CUDAScopedDevice scoped_device(this->device_);
    return impl_.bucket_count();
}

template <typename Key, typename Hash, typename Eq>
std::vector<int64_t> StdGPUHashBackend<Key, Hash, Eq>::BucketSizes() const {
    CUDAScopedDevice scoped_device(this->device_);
    utility::LogError("Unimplemented");
}

template <typename Key, typename Hash, typename Eq>
float StdGPUHashBackend<Key, Hash, Eq>::LoadFactor() const {
    CUDAScopedDevice scoped_device(this->device_);
    return impl_.load_factor();
}

// Need an explicit kernel for non-const access to map
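// Insertion is two-phase: emplace the key with a dummy index so that exactly
// one thread claims it, then (on success) allocate a buffer slot, copy the
// key and the SoA value blocks into the buffer, and patch the map entry with
// the real buf_index.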
template <typename Key, typename Hash, typename Eq, typename block_t>
__global__ void STDGPUInsertKernel(
        InternalStdGPUHashBackend<Key, Hash, Eq> map,
        CUDAHashBackendBufferAccessor buffer_accessor,
        const Key* input_keys,
        const void* const* input_values_soa,
        buf_index_t* output_buf_indices,
        bool* output_masks,
        int64_t count,
        int64_t n_values) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    output_buf_indices[tid] = 0;
    output_masks[tid] = false;

    // First apply 'try insert' with a dummy index
    auto res = map.emplace(key, 0);

    // If success, change the iterator and provide the actual index
    if (res.second) {
        buf_index_t buf_index = buffer_accessor.DeviceAllocate();
        auto key_ptr = buffer_accessor.GetKeyPtr(buf_index);

        // Copy templated key to buffer (duplicate)
        // TODO: hack stdgpu inside and take out the buffer directly
        *static_cast<Key*>(key_ptr) = key;

        // Copy/reset non-templated value in buffer
        for (int j = 0; j < n_values; ++j) {
            const int64_t blocks_per_element =
                    buffer_accessor.value_blocks_per_element_[j];

            block_t* dst_value = static_cast<block_t*>(
                    buffer_accessor.GetValuePtr(buf_index, j));
            const block_t* src_value =
                    static_cast<const block_t*>(input_values_soa[j]) +
                    blocks_per_element * tid;
            for (int b = 0; b < blocks_per_element; ++b) {
                dst_value[b] = src_value[b];
            }
        }

        // Update from the dummy index
        res.first->second = buf_index;

        // Write to return variables
        output_buf_indices[tid] = buf_index;
        output_masks[tid] = true;
    }
}

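// Insert stages the host-side array of SoA value pointers in a
// thrust::device_vector so the kernel can dereference it, and dispatches on
// buffer_accessor_.common_block_size_ to pick the block_t element type used
// for the value copies.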
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Insert(
        const void* input_keys,
        const std::vector<const void*>& input_values_soa,
        buf_index_t* output_buf_indices,
        bool* output_masks,
        int64_t count) {
    CUDAScopedDevice scoped_device(this->device_);
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    thrust::device_vector<const void*> input_values_soa_device(
            input_values_soa.begin(), input_values_soa.end());

    int64_t n_values = input_values_soa.size();
    const void* const* ptr_input_values_soa =
            thrust::raw_pointer_cast(input_values_soa_device.data());

    DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
            buffer_accessor_.common_block_size_, [&]() {
                STDGPUInsertKernel<Key, Hash, Eq, block_t>
                        <<<blocks, threads, 0, core::cuda::GetStream()>>>(
                                impl_, buffer_accessor_,
                                static_cast<const Key*>(input_keys),
                                ptr_input_values_soa, output_buf_indices,
                                output_masks, count, n_values);
            });
    cuda::Synchronize(this->device_);
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Allocate(int64_t capacity) {
    CUDAScopedDevice scoped_device(this->device_);
    this->capacity_ = capacity;

    // Allocate buffer for key values.
    this->buffer_ = std::make_shared<HashBackendBuffer>(
            this->capacity_, this->key_dsize_, this->value_dsizes_,
            this->device_);
    buffer_accessor_.Setup(*this->buffer_);

    // stdgpu initializes on the default stream. Set the current stream to
    // ensure correct behavior.
    {
        CUDAScopedStream scoped_stream(cuda::GetDefaultStream());

        impl_ = InternalStdGPUHashBackend<Key, Hash, Eq>::createDeviceObject(
                this->capacity_,
                InternalStdGPUHashBackendAllocator<Key>(this->device_.GetID()));
        cuda::Synchronize(this->device_);
    }
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Free() {
    CUDAScopedDevice scoped_device(this->device_);
    // Buffer is automatically handled by the smart pointer.
    buffer_accessor_.Shutdown(this->device_);

    // stdgpu initializes on the default stream. Set the current stream to
    // ensure correct behavior.
    {
        CUDAScopedStream scoped_stream(cuda::GetDefaultStream());

        InternalStdGPUHashBackend<Key, Hash, Eq>::destroyDeviceObject(impl_);
    }
}
}  // namespace core
}  // namespace open3d
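For reference, a minimal construction sketch follows. It is an illustration, not part of the header above: the Int64Hash and Int64Eq functors are hypothetical stand-ins for device-callable hash and equality functors, and each value_dsizes entry is assumed to be the per-element value size in bytes. The translation unit must be compiled as CUDA code, since the header defines __global__ kernels.

#include <cstddef>
#include <cstdint>
#include <vector>

#include "open3d/core/Device.h"
#include "open3d/core/hashmap/CUDA/StdGPUHashBackend.h"

// Hypothetical device-callable functors for int64_t keys.
struct Int64Hash {
    OPEN3D_HOST_DEVICE std::size_t operator()(const int64_t& key) const {
        return static_cast<std::size_t>(key) * 0x9E3779B97F4A7C15ull;
    }
};

struct Int64Eq {
    OPEN3D_HOST_DEVICE bool operator()(const int64_t& a,
                                       const int64_t& b) const {
        return a == b;
    }
};

void ExampleConstructBackend() {
    using namespace open3d::core;
    // One int64_t key mapped to one 3-float value per entry.
    std::vector<int64_t> value_dsizes = {
            static_cast<int64_t>(3 * sizeof(float))};
    StdGPUHashBackend<int64_t, Int64Hash, Int64Eq> backend(
            /*init_capacity=*/1 << 20,
            /*key_dsize=*/sizeof(int64_t),
            value_dsizes, Device("CUDA:0"));
}

In normal use the backend is created through Open3D's higher-level hash map interface rather than constructed directly; the sketch only illustrates the constructor parameters declared in this header.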