Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
memory.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #ifndef ARROW_UTIL_MEMORY_H
19 #define ARROW_UTIL_MEMORY_H
20 
#include <cstdint>
#include <cstring>
#include <thread>
#include <vector>
23 
namespace arrow {
namespace internal {

// Apply a bit mask to the numeric value of a pointer.
//
// Passing `~(alignment - 1)` for `bits` (with a power-of-two alignment) rounds
// the address down to the previous multiple of `alignment`.
//
// The constness of `address` is dropped in the returned pointer; the result is
// intended for address arithmetic.
//
// Declared inline because this definition lives in a header: without it,
// including the header from more than one translation unit produces duplicate
// symbol definitions at link time.
inline uint8_t* pointer_logical_and(const uint8_t* address, uintptr_t bits) {
  uintptr_t value = reinterpret_cast<uintptr_t>(address);
  return reinterpret_cast<uint8_t*>(value & bits);
}

// A helper function for doing memcpy with multiple threads. This is required
// to saturate the memory bandwidth of modern cpus.
//
// Copies `nbytes` bytes from `src` to `dst`. The source range is split into
//   | prefix | num_threads * chunk_size | suffix |
// where the middle part starts and ends on a `block_size` boundary of `src`.
// Each of the `num_threads` spawned threads copies one chunk; the calling
// thread copies the prefix and suffix while they run, then joins them all.
//
// `block_size` is expected to be a non-zero power of two. When the request
// cannot be split sensibly (non-positive `num_threads`, invalid `block_size`,
// fewer bytes than one block, or fewer whole blocks than threads) the copy is
// done with a single memcpy on the calling thread.
//
// As with memcpy, the source and destination ranges must not overlap.
inline void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes,
                             uintptr_t block_size, int num_threads) {
  if (nbytes <= 0) {
    return;
  }

  const bool block_size_ok =
      block_size != 0 && (block_size & (block_size - 1)) == 0;
  // With fewer bytes than one block the aligned region may be "negative"
  // (left > right), which previously led to out-of-bounds copies.
  const bool too_small = static_cast<uint64_t>(nbytes) < block_size;
  if (num_threads < 1 || !block_size_ok || too_small) {
    std::memcpy(dst, src, static_cast<std::size_t>(nbytes));
    return;
  }

  // nbytes >= block_size from here on, so block_size fits in int64_t and
  // right >= left is guaranteed.
  const int64_t block = static_cast<int64_t>(block_size);
  const uintptr_t mask = ~(block_size - 1);
  uint8_t* left = pointer_logical_and(src + block_size - 1, mask);
  uint8_t* right = pointer_logical_and(src + nbytes, mask);
  const int64_t num_blocks = (right - left) / block;

  // Shrink the aligned region so that it holds a multiple of num_threads
  // blocks; the blocks cut off here become part of the suffix.
  right -= (num_blocks % num_threads) * block;

  const int64_t chunk_size = (right - left) / num_threads;
  if (chunk_size == 0) {
    // Fewer whole blocks than threads: nothing to hand out.
    std::memcpy(dst, src, static_cast<std::size_t>(nbytes));
    return;
  }

  const int64_t prefix = left - src;
  const int64_t suffix = src + nbytes - right;

  // Start all threads first and handle leftovers while threads run.
  std::vector<std::thread> threadpool;
  threadpool.reserve(static_cast<std::size_t>(num_threads));
  for (int i = 0; i < num_threads; ++i) {
    // A lambda is used instead of passing memcpy itself: taking the address
    // of a standard library function is not portable.
    threadpool.emplace_back(
        [](uint8_t* chunk_dst, const uint8_t* chunk_src, int64_t chunk_bytes) {
          std::memcpy(chunk_dst, chunk_src, static_cast<std::size_t>(chunk_bytes));
        },
        dst + prefix + i * chunk_size, left + i * chunk_size, chunk_size);
  }

  std::memcpy(dst, src, static_cast<std::size_t>(prefix));
  std::memcpy(dst + prefix + num_threads * chunk_size, right,
              static_cast<std::size_t>(suffix));

  for (auto& t : threadpool) {
    if (t.joinable()) {
      t.join();
    }
  }
}

}  // namespace internal
}  // namespace arrow
72 
73 #endif // ARROW_UTIL_MEMORY_H
Top-level namespace for Apache Arrow C++ API.
Definition: allocator.h:29