Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
hdfs.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #ifndef ARROW_IO_HDFS
19 #define ARROW_IO_HDFS
20 
21 #include <cstdint>
22 #include <memory>
23 #include <string>
24 #include <vector>
25 
26 #include "arrow/io/interfaces.h"
27 #include "arrow/util/macros.h"
28 #include "arrow/util/visibility.h"
29 
30 namespace arrow {
31 
// Forward declarations of arrow types that the declarations below use only
// as pointer targets, shared_ptr element types, or function return types.
32 class Buffer;
33 class MemoryPool;
34 class Status;
35 
36 namespace io {
37 
// Forward declarations; both classes are defined later in this header, after
// HadoopFileSystem, which names them in friend declarations and signatures.
38 class HdfsReadableFile;
39 class HdfsOutputStream;
40 
// Plain aggregate describing a single HDFS path entry. It is the out-parameter
// type of HadoopFileSystem::GetPathInfo and the element type of the vector
// passed to HadoopFileSystem::ListDirectory.
//
// NOTE(review): this rendered listing omits original lines 42, 52 and 53. The
// cross-reference index at the end of this page places
// `ObjectType::type kind` at line 42 and `int32_t last_modified_time` /
// `int32_t last_access_time` at lines 52 and 53.
41 struct HdfsPathInfo {
43 
44  std::string name;
45  std::string owner;
46  std::string group;
47 
48  // Access times in UNIX timestamps (seconds)
    // NOTE(review): the comment above presumably belongs to the omitted
    // last_modified_time / last_access_time members (lines 52-53), not to
    // size / block_size below -- confirm against the real header.
49  int64_t size;
50  int64_t block_size;
51 
54 
55  int16_t replication;
56  int16_t permissions;
57 };
58 
// Scoped enumeration with two values, LIBHDFS and LIBHDFS3, stored in a char.
// The cross-reference index at the end of this page shows a
// `HdfsDriver driver` member at hdfs.h:66, i.e. it is part of the connection
// configuration.
59 enum class HdfsDriver : char { LIBHDFS, LIBHDFS3 };
60 
// NOTE(review): this rendered listing omits original line 61, the opening of
// this aggregate. It is presumably `struct HdfsConnectionConfig {`, the type
// HadoopFileSystem::Connect takes by pointer -- confirm against the real
// header. Line 66 is also omitted; the cross-reference index at the end of
// this page places `HdfsDriver driver` there.
62  std::string host;
63  int port;
64  std::string user;
65  std::string kerb_ticket;
67 };
68 
// HDFS-backed implementation of the FileSystem interface. Instances are
// produced by the static Connect() factory, which hands back a shared_ptr.
// All state lives in a forward-declared nested implementation class held
// through impl_, so no HDFS driver types appear in this header.
//
// NOTE(review): this rendered listing omits original lines 71, 121-125,
// 128-132 and 171-172; whatever they declare or document is not visible here.
69 class ARROW_EXPORT HadoopFileSystem : public FileSystem {
70  public:
72 
73  // Connect to an HDFS cluster given a configuration
74  //
75  // @param config (in): configuration for connecting
76  // @param fs (out): the created client
77  // @returns Status
78  static Status Connect(const HdfsConnectionConfig* config,
79  std::shared_ptr<HadoopFileSystem>* fs);
80 
81  // Create directory and all parents
82  //
83  // @param path (in): absolute HDFS path
84  // @returns Status
85  Status MakeDirectory(const std::string& path) override;
86 
87  // Delete file or directory
88  // @param path: absolute path to data
89  // @param recursive: if path is a directory, delete contents as well
90  // @returns error status on failure
    // Unlike its neighbours this member is not marked `override`, so it is
    // specific to this class rather than part of the FileSystem interface.
91  Status Delete(const std::string& path, bool recursive = false);
92 
    // FileSystem override taking only a path.
    // NOTE(review): whether a non-empty directory is removed recursively is
    // not visible from this header -- confirm in the implementation.
93  Status DeleteDirectory(const std::string& path) override;
94 
95  // Disconnect from cluster
96  //
97  // @returns Status
98  Status Disconnect();
99 
100  // @param path (in): absolute HDFS path
101  // @returns bool, true if the path exists, false if not (or on error)
102  bool Exists(const std::string& path);
103 
104  // @param path (in): absolute HDFS path
105  // @param info (out)
106  // @returns Status
107  Status GetPathInfo(const std::string& path, HdfsPathInfo* info);
108 
109  // @param nbytes (out): total capacity of the filesystem
110  // @returns Status
111  Status GetCapacity(int64_t* nbytes);
112 
113  // @param nbytes (out): total bytes used of the filesystem
114  // @returns Status
115  Status GetUsed(int64_t* nbytes);
116 
    // FileSystem override; `listing` is a vector of strings.
    // NOTE(review): presumably filled with the entries under `path` -- whether
    // they are bare names or full paths must be confirmed in the implementation.
117  Status GetChildren(const std::string& path, std::vector<std::string>* listing) override;
118 
    // HDFS-specific counterpart of GetChildren whose `listing` carries
    // HdfsPathInfo records instead of strings.
119  Status ListDirectory(const std::string& path, std::vector<HdfsPathInfo>* listing);
120 
    // NOTE(review): the doc comment for Chown (original lines 121-125) is
    // omitted from this listing. `owner` and `group` are C strings and thus
    // nullable by type; whether null means "leave unchanged" must be confirmed.
126  Status Chown(const std::string& path, const char* owner, const char* group);
127 
    // NOTE(review): the doc comment for Chmod (original lines 128-132) is
    // omitted from this listing. `mode` is a plain int; presumably POSIX-style
    // permission bits -- confirm.
133  Status Chmod(const std::string& path, int mode);
134 
135  // Move file or directory from source path to destination path within the
136  // current filesystem
137  Status Rename(const std::string& src, const std::string& dst) override;
138 
    // FileSystem override writing into a FileStatistics out-parameter.
139  Status Stat(const std::string& path, FileStatistics* stat) override;
140 
141  // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory
142 
143  // Open an HDFS file in READ mode. Returns error
144  // status if the file is not found.
145  //
146  // @param path complete file path
147  Status OpenReadable(const std::string& path, int32_t buffer_size,
148  std::shared_ptr<HdfsReadableFile>* file);
149 
    // Overload without buffer_size.
    // NOTE(review): presumably forwards a default buffer size -- confirm.
150  Status OpenReadable(const std::string& path, std::shared_ptr<HdfsReadableFile>* file);
151 
152  // FileMode::WRITE options
153  // @param path complete file path
154  // @param buffer_size, 0 for default
155  // @param replication, 0 for default
156  // @param default_block_size, 0 for default
157  Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size,
158  int16_t replication, int64_t default_block_size,
159  std::shared_ptr<HdfsOutputStream>* file);
160 
    // Overload taking only path and append.
    // NOTE(review): presumably equivalent to passing 0 ("default") for the
    // three tuning parameters above -- confirm.
161  Status OpenWriteable(const std::string& path, bool append,
162  std::shared_ptr<HdfsOutputStream>* file);
163 
164  private:
    // The two file classes are friends, giving them access to the private
    // members of this class, including the nested implementation type.
165  friend class HdfsReadableFile;
166  friend class HdfsOutputStream;
167 
    // Implementation type is declared here but defined elsewhere, and is
    // excluded from the exported symbols via ARROW_NO_EXPORT.
168  class ARROW_NO_EXPORT HadoopFileSystemImpl;
169  std::unique_ptr<HadoopFileSystemImpl> impl_;
170 
173 };
174 
// Read handle on an HDFS file, implementing the RandomAccessFile interface.
// The only constructor visible here is private and
// HadoopFileSystem::HadoopFileSystemImpl is a friend, so instances are
// presumably created by the filesystem and handed out through
// HadoopFileSystem::OpenReadable.
//
// NOTE(review): this rendered listing omits original line 209; its content is
// not visible here.
175 class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile {
176  public:
177  ~HdfsReadableFile();
178 
179  Status Close() override;
180 
    // Writes the file size to `size`.
    // NOTE(review): presumably in bytes -- confirm.
181  Status GetSize(int64_t* size) override;
182 
183  // NOTE: If you wish to read a particular range of a file in a multithreaded
184  // context, you may prefer to use ReadAt to avoid locking issues
185  Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override;
186 
    // Same read, but the result is delivered as a Buffer through `out`
    // instead of being copied into caller-provided memory.
187  Status Read(int64_t nbytes, std::shared_ptr<Buffer>* out) override;
188 
    // Positional variants: they take an explicit `position` argument.
189  Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read,
190  uint8_t* buffer) override;
191 
192  Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr<Buffer>* out) override;
193 
194  bool supports_zero_copy() const override;
195 
196  Status Seek(int64_t position) override;
197  Status Tell(int64_t* position) const override;
198 
    // NOTE(review): presumably selects the pool used to allocate the Buffers
    // returned by the Buffer-producing Read/ReadAt overloads -- confirm.
199  void set_memory_pool(MemoryPool* pool);
200 
201  private:
    // `explicit` prevents implicit conversion from MemoryPool*; the pool
    // argument defaults to NULLPTR.
202  explicit HdfsReadableFile(MemoryPool* pool = NULLPTR);
203 
    // Implementation type is declared here but defined elsewhere.
204  class ARROW_NO_EXPORT HdfsReadableFileImpl;
205  std::unique_ptr<HdfsReadableFileImpl> impl_;
206 
207  friend class HadoopFileSystem::HadoopFileSystemImpl;
208 
210 };
211 
212 // Naming this file OutputStream because it does not support seeking (like the
213 // WriteableFile interface)
//
// Write handle on an HDFS file, implementing the OutputStream interface.
// HadoopFileSystem::HadoopFileSystemImpl is a friend; shared pointers to this
// class are handed out by HadoopFileSystem::OpenWriteable.
//
// NOTE(review): this rendered listing omits original lines 234 and 236; their
// content is not visible here.
214 class ARROW_EXPORT HdfsOutputStream : public OutputStream {
215  public:
216  ~HdfsOutputStream();
217 
218  Status Close() override;
219 
220  Status Write(const uint8_t* buffer, int64_t nbytes) override;
221 
    // Class-specific overload (not an override) with an extra
    // `bytes_written` out-parameter.
222  Status Write(const uint8_t* buffer, int64_t nbytes, int64_t* bytes_written);
223 
224  Status Flush() override;
225 
226  Status Tell(int64_t* position) const override;
227 
228  private:
    // Implementation type is declared here but defined elsewhere.
229  class ARROW_NO_EXPORT HdfsOutputStreamImpl;
230  std::unique_ptr<HdfsOutputStreamImpl> impl_;
231 
232  friend class HadoopFileSystem::HadoopFileSystemImpl;
233 
235 
237 };
238 
// Exported free functions taking no arguments and returning a Status.
// NOTE(review): presumably each reports whether the corresponding driver
// library (see HdfsDriver::LIBHDFS / LIBHDFS3) is available, with a non-OK
// Status on failure -- confirm in the implementation.
239 Status ARROW_EXPORT HaveLibHdfs();
240 Status ARROW_EXPORT HaveLibHdfs3();
241 
242 } // namespace io
243 } // namespace arrow
244 
245 #endif // ARROW_IO_HDFS
int32_t last_modified_time
Definition: hdfs.h:52
arrow::io::HadoopFileSystem
Definition: hdfs.h:69
int32_t last_access_time
Definition: hdfs.h:53
Definition: interfaces.h:111
Definition: interfaces.h:44
type
Definition: interfaces.h:41
arrow::io::HdfsConnectionConfig
Definition: hdfs.h:61
Status HaveLibHdfs()
#define ARROW_NO_EXPORT
Definition: visibility.h:40
Definition: interfaces.h:121
int64_t block_size
Definition: hdfs.h:50
#define NULLPTR
Definition: macros.h:69
int64_t size
Definition: hdfs.h:49
HdfsDriver driver
Definition: hdfs.h:66
ObjectType::type kind
Definition: hdfs.h:42
Definition: status.h:106
HdfsDriver
Definition: hdfs.h:59
std::string host
Definition: hdfs.h:62
std::string group
Definition: hdfs.h:46
std::string kerb_ticket
Definition: hdfs.h:65
arrow::io::HdfsOutputStream
Definition: hdfs.h:214
int16_t permissions
Definition: hdfs.h:56
arrow::io::HdfsPathInfo
Definition: hdfs.h:41
Top-level namespace for Apache Arrow C++ API.
Definition: allocator.h:29
int port
Definition: hdfs.h:63
Definition: interfaces.h:50
int16_t replication
Definition: hdfs.h:55
std::string name
Definition: hdfs.h:44
arrow::io::HdfsReadableFile
Definition: hdfs.h:175
std::string user
Definition: hdfs.h:64
Base class for memory allocation.
Definition: memory_pool.h:34
Status HaveLibHdfs3()
#define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName)
Definition: macros.h:23
std::string owner
Definition: hdfs.h:45