# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# flake8: noqa

"""
PyArrow is the python implementation of Apache Arrow.

Apache Arrow is a cross-language development platform for in-memory data.
It specifies a standardized language-independent columnar memory format for
flat and hierarchical data, organized for efficient analytic operations on
modern hardware. It also provides computational libraries and zero-copy
streaming messaging and interprocess communication.

For more information see the official page at https://arrow.apache.org
"""

import gc as _gc
import importlib as _importlib
import os as _os
import platform as _platform
import sys as _sys
import warnings as _warnings

try:
    from ._generated_version import version as __version__
except ImportError:
    # Package is not installed, parse git tag at runtime
    try:
        import setuptools_scm
        # Code duplicated from setup.py to avoid a dependency on each other

        def parse_git(root, **kwargs):
            """
            Parse function for setuptools_scm that ignores tags for non-C++
            subprojects, e.g. apache-arrow-js-XXX tags.
            """
            from setuptools_scm.git import parse
            kwargs['describe_command'] = \
                "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'"
            return parse(root, **kwargs)
        __version__ = setuptools_scm.get_version('../',
                                                 parse=parse_git)
    except ImportError:
        __version__ = None

# ARROW-8684: Disable GC while initializing Cython extension module,
# to workaround Cython bug in https://github.com/cython/cython/issues/3603
_gc_enabled = _gc.isenabled()
_gc.disable()
try:
    import pyarrow.lib as _lib
finally:
    # Restore the collector even when the extension module fails to import;
    # otherwise a caller that catches the ImportError would be left with
    # garbage collection switched off for the rest of the process.
    if _gc_enabled:
        _gc.enable()

from pyarrow.lib import (BuildInfo, RuntimeInfo, MonthDayNano,
                         VersionInfo, cpp_build_info, cpp_version,
                         cpp_version_info, runtime_info, cpu_count,
                         set_cpu_count, enable_signal_handlers,
                         io_thread_count, set_io_thread_count)


def show_versions():
    """
    Print various version information, to help with error reporting.
    """
    def print_entry(label, value):
        # Left-align the label in a 26-character column.
        print(f"{label: <26}: {value: <8}")

    print("pyarrow version info\n--------------------")
    print_entry("Package kind", cpp_build_info.package_kind
                if len(cpp_build_info.package_kind) > 0
                else "not indicated")
    print_entry("Arrow C++ library version", cpp_build_info.version)
    # Separate compiler id and version with a space so they stay readable.
    print_entry("Arrow C++ compiler",
                f"{cpp_build_info.compiler_id} "
                f"{cpp_build_info.compiler_version}")
    print_entry("Arrow C++ compiler flags", cpp_build_info.compiler_flags)
    print_entry("Arrow C++ git revision", cpp_build_info.git_id)
    print_entry("Arrow C++ git description", cpp_build_info.git_description)
    print_entry("Arrow C++ build type", cpp_build_info.build_type)


def _module_is_available(module):
    """
    Return True if the submodule ``pyarrow.<module>`` can be imported,
    False if importing it raises ImportError.
    """
    try:
        _importlib.import_module(f'pyarrow.{module}')
    except ImportError:
        return False
    else:
        return True


def _filesystem_is_available(fs):
    """
    Return True if ``pyarrow.fs`` can be imported and exposes an attribute
    named `fs`, False otherwise.
    """
    try:
        import pyarrow.fs
    except ImportError:
        return False

    try:
        getattr(pyarrow.fs, fs)
    except (ImportError, AttributeError):
        return False
    else:
        return True


def show_info():
    """
    Print detailed version and platform information, for error reporting
    """
    show_versions()

    def print_entry(label, value):
        print(f" {label: <20}: {value: <8}")

    print("\nPlatform:")
    # Separate OS name and machine type with a space so they stay readable.
    print_entry("OS / Arch", f"{_platform.system()} {_platform.machine()}")
    print_entry("SIMD Level", runtime_info().simd_level)
    print_entry("Detected SIMD Level", runtime_info().detected_simd_level)

    # default_memory_pool, supported_memory_backends and Codec are imported
    # further down in this module; they are resolved when show_info() runs.
    pool = default_memory_pool()
    print("\nMemory:")
    print_entry("Default backend", pool.backend_name)
    print_entry("Bytes allocated", f"{pool.bytes_allocated()} bytes")
    print_entry("Max memory", f"{pool.max_memory()} bytes")
    print_entry("Supported Backends", ', '.join(supported_memory_backends()))

    print("\nOptional modules:")
    modules = ["csv", "cuda", "dataset", "feather", "flight", "fs",
               "gandiva", "json", "orc", "parquet"]
    for module in modules:
        status = "Enabled" if _module_is_available(module) else "-"
        print(f" {module: <20}: {status: <8}")

    print("\nFilesystems:")
    filesystems = ["GcsFileSystem", "HadoopFileSystem", "S3FileSystem"]
    for fs in filesystems:
        status = "Enabled" if _filesystem_is_available(fs) else "-"
        print(f" {fs: <20}: {status: <8}")

    print("\nCompression Codecs:")
    codecs = ["brotli", "bz2", "gzip", "lz4_frame", "lz4", "snappy", "zstd"]
    for codec in codecs:
        status = "Enabled" if Codec.is_available(codec) else "-"
        print(f" {codec: <20}: {status: <8}")


from pyarrow.lib import (null, bool_,
                         int8, int16, int32, int64,
                         uint8, uint16, uint32, uint64,
                         time32, time64, timestamp, date32, date64, duration,
                         month_day_nano_interval,
                         float16, float32, float64,
                         binary, string, utf8,
                         large_binary, large_string, large_utf8,
                         decimal128, decimal256,
                         list_, large_list, map_, struct,
                         union, sparse_union, dense_union,
                         dictionary,
                         run_end_encoded,
                         fixed_shape_tensor,
                         field,
                         type_for_alias,
                         DataType, DictionaryType, StructType,
                         ListType, LargeListType, MapType, FixedSizeListType,
                         UnionType, SparseUnionType, DenseUnionType,
                         TimestampType, Time32Type, Time64Type, DurationType,
                         FixedSizeBinaryType, Decimal128Type, Decimal256Type,
                         BaseExtensionType, ExtensionType,
                         RunEndEncodedType, FixedShapeTensorType,
                         PyExtensionType, UnknownExtensionType,
                         register_extension_type, unregister_extension_type,
                         DictionaryMemo,
                         KeyValueMetadata,
                         Field,
                         Schema,
                         schema,
                         unify_schemas,
                         Array, Tensor,
                         array, chunked_array, record_batch, nulls, repeat,
                         SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
                         SparseCSFTensor,
                         infer_type, from_numpy_dtype,
                         NullArray,
                         NumericArray, IntegerArray, FloatingPointArray,
                         BooleanArray,
                         Int8Array, UInt8Array,
                         Int16Array, UInt16Array,
                         Int32Array, UInt32Array,
                         Int64Array, UInt64Array,
                         HalfFloatArray, FloatArray, DoubleArray,
                         ListArray, LargeListArray, MapArray,
                         FixedSizeListArray, UnionArray,
                         BinaryArray, StringArray,
                         LargeBinaryArray, LargeStringArray,
                         FixedSizeBinaryArray,
                         DictionaryArray,
                         Date32Array, Date64Array, TimestampArray,
                         Time32Array, Time64Array, DurationArray,
                         MonthDayNanoIntervalArray,
                         Decimal128Array, Decimal256Array, StructArray,
                         ExtensionArray,
                         RunEndEncodedArray, FixedShapeTensorArray,
                         scalar, NA, _NULL as NULL, Scalar,
                         NullScalar, BooleanScalar,
                         Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
                         UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
                         HalfFloatScalar, FloatScalar, DoubleScalar,
                         Decimal128Scalar, Decimal256Scalar,
                         ListScalar, LargeListScalar, FixedSizeListScalar,
                         Date32Scalar, Date64Scalar,
                         Time32Scalar, Time64Scalar,
                         TimestampScalar, DurationScalar,
                         MonthDayNanoIntervalScalar,
                         BinaryScalar, LargeBinaryScalar,
                         StringScalar, LargeStringScalar,
                         FixedSizeBinaryScalar, DictionaryScalar,
                         MapScalar, StructScalar, UnionScalar,
                         RunEndEncodedScalar, ExtensionScalar)

# Buffers, allocation
from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer,
                         Codec, compress, decompress, allocate_buffer)

from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool,
                         total_allocated_bytes, set_memory_pool,
                         default_memory_pool, system_memory_pool,
                         jemalloc_memory_pool, mimalloc_memory_pool,
                         logging_memory_pool, proxy_memory_pool,
                         log_memory_allocations, jemalloc_set_decay_ms,
                         supported_memory_backends)

# I/O
from pyarrow.lib import (NativeFile, PythonFile,
                         BufferedInputStream, BufferedOutputStream,
                         CompressedInputStream, CompressedOutputStream,
                         TransformInputStream, transcoding_input_stream,
                         FixedSizeBufferWriter,
                         BufferReader, BufferOutputStream,
                         OSFile, MemoryMappedFile, memory_map,
                         create_memory_map, MockOutputStream,
                         input_stream, output_stream)

from pyarrow._hdfsio import HdfsFile, have_libhdfs

from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table,
                         concat_arrays, concat_tables, TableGroupBy,
                         RecordBatchReader)

# Exceptions
from pyarrow.lib import (ArrowCancelled,
                         ArrowCapacityError,
                         ArrowException,
                         ArrowKeyError,
                         ArrowIndexError,
                         ArrowInvalid,
                         ArrowIOError,
                         ArrowMemoryError,
                         ArrowNotImplementedError,
                         ArrowTypeError,
                         ArrowSerializationError)

import pyarrow.hdfs as hdfs

from pyarrow.ipc import serialize_pandas, deserialize_pandas
import pyarrow.ipc as ipc

import pyarrow.types as types


# deprecated top-level access

from pyarrow.filesystem import FileSystem as _FileSystem
from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem

_localfs = _LocalFileSystem._get_instance()


_msg = (
    "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
)

_serialization_msg = (
    "'pyarrow.{0}' is deprecated and will be removed in a future version. "
    "Use pickle or the pyarrow IPC functionality instead."
)

# Maps a deprecated top-level attribute name to a tuple of
# (object to return, name to recommend under pyarrow.fs).
_deprecated = {
    "localfs": (_localfs, "LocalFileSystem"),
    "FileSystem": (_FileSystem, "FileSystem"),
    "LocalFileSystem": (_LocalFileSystem, "LocalFileSystem"),
    "HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
}


def __getattr__(name):
    """
    Module-level attribute hook serving the names listed in `_deprecated`.

    A listed name emits a FutureWarning and returns the mapped object;
    any other name raises AttributeError.
    """
    if name in _deprecated:
        obj, new_name = _deprecated[name]
        _warnings.warn(_msg.format(name, new_name),
                       FutureWarning, stacklevel=2)
        return obj

    raise AttributeError(
        "module 'pyarrow' has no attribute '{0}'".format(name)
    )


# ----------------------------------------------------------------------
# Deprecations

from pyarrow.util import _deprecate_api, _deprecate_class


# TODO: Deprecate these somehow in the pyarrow namespace
from pyarrow.ipc import (Message, MessageReader, MetadataVersion,
                         RecordBatchFileReader, RecordBatchFileWriter,
                         RecordBatchStreamReader, RecordBatchStreamWriter)

# ----------------------------------------------------------------------
# Returning absolute path to the pyarrow include directory (if bundled, e.g. in
# wheels)
def get_include():
    """
    Return the path of the ``include`` directory located next to this
    module file, where the Arrow C++ headers are expected when bundled.

    Similar to numpy.get_include
    """
    package_dir = _os.path.dirname(__file__)
    return _os.path.join(package_dir, 'include')
def get_libraries():
    """
    Return the library names to pass as the `libraries` argument when
    building C or Cython extensions against pyarrow.

    A new list is built on every call.
    """
    library_names = ['arrow_python', 'arrow']
    return library_names
def create_library_symlinks():
    """
    With Linux and macOS wheels, the bundled shared libraries have an embedded
    ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them
    with -larrow won't work unless we create symlinks at locations like
    site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses
    prior problems we had with shipping two copies of the shared libraries to
    permit third party projects like turbodbc to build their C++ extensions
    against the pyarrow wheels.

    This function must only be invoked once and only when the shared libraries
    are bundled with the Python package, which should only apply to wheel-based
    installs. It requires write access to the site-packages/pyarrow directory
    and so depending on your system may need to be run with root.

    Does nothing on Windows. A symlink whose target path already exists is
    skipped. If creating a symlink raises PermissionError, a message naming
    the symlink path is printed and processing continues with the next
    library.
    """
    import glob
    if _sys.platform == 'win32':
        return
    package_cwd = _os.path.dirname(__file__)

    if _sys.platform == 'linux':
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))

        def get_symlink_path(hard_path):
            # libarrow.so.17 -> libarrow.so
            return hard_path.rsplit('.', 1)[0]
    else:
        # Every other non-Windows platform is handled with the macOS naming.
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))

        def get_symlink_path(hard_path):
            # libarrow.17.dylib -> libarrow.dylib
            return '.'.join((hard_path.rsplit('.', 2)[0], 'dylib'))

    for lib_hard_path in bundled_libs:
        symlink_path = get_symlink_path(lib_hard_path)
        if _os.path.exists(symlink_path):
            continue
        try:
            _os.symlink(lib_hard_path, symlink_path)
        except PermissionError:
            # Fill in the placeholder so the message names the path that
            # could not be created instead of printing a literal "{}".
            print("Tried creating symlink {}. If you need to link to "
                  "bundled shared libraries, run "
                  "pyarrow.create_library_symlinks() as root"
                  .format(symlink_path))
def get_library_dirs():
    """
    Return lists of directories likely to contain Arrow C++ libraries for
    linking C or Cython extensions using pyarrow

    The result always starts with the directory of this module file.
    Directories reported by pkg-config, the ``Library/lib`` directory next
    to the Python executable on Windows (only if it contains ``arrow.lib``),
    and ``$ARROW_HOME/lib`` are appended when applicable, without
    duplicates.

    Raises
    ------
    ValueError
        If pkg-config returns a non-empty value that does not start
        with ``-L``.
    """
    package_cwd = _os.path.dirname(__file__)
    library_dirs = [package_cwd]

    def append_library_dir(library_dir):
        # Keep first-seen order while avoiding duplicate entries.
        if library_dir not in library_dirs:
            library_dirs.append(library_dir)

    # Search library paths via pkg-config. This is necessary if the user
    # installed libarrow and the other shared libraries manually and they
    # are not shipped inside the pyarrow package (see also ARROW-2976).
    for pkgname in ["arrow", "arrow_python"]:
        if _has_pkg_config(pkgname):
            library_dir = _read_pkg_config_variable(pkgname,
                                                    ["--libs-only-L"])
            # pkg-config output could be empty if Arrow is installed
            # as a system package.
            if library_dir:
                if not library_dir.startswith("-L"):
                    raise ValueError(
                        "pkg-config --libs-only-L returned unexpected "
                        "value {!r}".format(library_dir))
                # Drop the leading "-L" to obtain the directory itself.
                append_library_dir(library_dir[2:])

    if _sys.platform == 'win32':
        # TODO(wesm): Is this necessary, or does setuptools within a conda
        # installation add Library\lib to the linker path for MSVC?
        python_base_install = _os.path.dirname(_sys.executable)
        library_dir = _os.path.join(python_base_install, 'Library', 'lib')

        if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
            append_library_dir(library_dir)

    # ARROW-4074: Allow for ARROW_HOME to be set to some other directory
    arrow_home = _os.environ.get('ARROW_HOME')
    if arrow_home:
        append_library_dir(_os.path.join(arrow_home, 'lib'))
    else:
        # Python wheels bundle the Arrow libraries in the pyarrow directory.
        append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))

    return library_dirs