# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from pyarrow._compute import (  # noqa
    Function,
    FunctionOptions,
    FunctionRegistry,
    HashAggregateFunction,
    HashAggregateKernel,
    Kernel,
    ScalarAggregateFunction,
    ScalarAggregateKernel,
    ScalarFunction,
    ScalarKernel,
    VectorFunction,
    VectorKernel,
    # Option classes
    ArraySortOptions,
    AssumeTimezoneOptions,
    CastOptions,
    CountOptions,
    CumulativeOptions,
    CumulativeSumOptions,
    DayOfWeekOptions,
    DictionaryEncodeOptions,
    RunEndEncodeOptions,
    ElementWiseAggregateOptions,
    ExtractRegexOptions,
    FilterOptions,
    IndexOptions,
    JoinOptions,
    ListSliceOptions,
    MakeStructOptions,
    MapLookupOptions,
    MatchSubstringOptions,
    ModeOptions,
    NullOptions,
    PadOptions,
    PairwiseOptions,
    PartitionNthOptions,
    QuantileOptions,
    RandomOptions,
    RankOptions,
    ReplaceSliceOptions,
    ReplaceSubstringOptions,
    RoundBinaryOptions,
    RoundOptions,
    RoundTemporalOptions,
    RoundToMultipleOptions,
    ScalarAggregateOptions,
    SelectKOptions,
    SetLookupOptions,
    SliceOptions,
    SortOptions,
    SplitOptions,
    SplitPatternOptions,
    StrftimeOptions,
    StrptimeOptions,
    StructFieldOptions,
    TakeOptions,
    TDigestOptions,
    TrimOptions,
    Utf8NormalizeOptions,
    VarianceOptions,
    WeekOptions,
    # Functions
    call_function,
    function_registry,
    get_function,
    list_functions,
    # Udf
    call_tabular_function,
    register_scalar_function,
    register_tabular_function,
    register_aggregate_function,
    register_vector_function,
    UdfContext,
    # Expressions
    Expression,
)

from collections import namedtuple
import inspect
from textwrap import dedent
import warnings

import pyarrow as pa
from pyarrow import _compute_docstrings
from pyarrow.vendored import docscrape


def _get_arg_names(func):
    """Return the argument names recorded in the function's ``_doc``."""
    return func._doc.arg_names


# Holder for the pieces scraped from an options class docstring; only the
# "Parameters" section is kept.
_OptionsClassDoc = namedtuple('_OptionsClassDoc', ('params',))


def _scrape_options_class_doc(options_class):
    """
    Parse the "Parameters" section out of an options class docstring.

    Returns None when the class has no (or an empty) docstring, otherwise
    an ``_OptionsClassDoc`` wrapping the parsed parameter entries.
    """
    if not options_class.__doc__:
        return None
    doc = docscrape.NumpyDocString(options_class.__doc__)
    return _OptionsClassDoc(doc['Parameters'])


def _decorate_compute_function(wrapper, exposed_name, func, options_class):
    """
    Decorate the given compute function wrapper with useful metadata
    and documentation.

    Sets ``__arrow_compute_function__``, ``__name__``, ``__qualname__`` and
    ``__doc__`` on `wrapper` and returns the same object. The docstring is
    assembled from the function's ``_doc`` (summary, description), its
    argument names, the options class parameters (if any), and an optional
    hand-written addition looked up by function name in
    ``_compute_docstrings.function_doc_additions``.
    """
    cpp_doc = func._doc

    wrapper.__arrow_compute_function__ = dict(
        name=func.name,
        arity=func.arity,
        options_class=cpp_doc.options_class,
        options_required=cpp_doc.options_required)
    wrapper.__name__ = exposed_name
    wrapper.__qualname__ = exposed_name

    doc_pieces = []

    # 1. One-line summary
    summary = cpp_doc.summary
    if not summary:
        # No summary available: fall back to a generic sentence.
        arg_str = "arguments" if func.arity > 1 else "argument"
        summary = ("Call compute function {!r} with the given {}"
                   .format(func.name, arg_str))

    doc_pieces.append(f"{summary}.\n\n")

    # 2. Multi-line description
    description = cpp_doc.description
    if description:
        doc_pieces.append(f"{description}\n\n")

    doc_addition = _compute_docstrings.function_doc_additions.get(func.name)

    # 3. Parameter description
    doc_pieces.append(dedent("""\
        Parameters
        ----------
        """))

    # 3a. Compute function parameters
    arg_names = _get_arg_names(func)
    for arg_name in arg_names:
        # 'vector' and 'scalar_aggregate' kinds are documented as taking
        # array-like input only; every other kind also lists scalar-like.
        if func.kind in ('vector', 'scalar_aggregate'):
            arg_type = 'Array-like'
        else:
            arg_type = 'Array-like or scalar-like'
        doc_pieces.append(f"{arg_name} : {arg_type}\n")
        doc_pieces.append("    Argument to compute function.\n")

    # 3b. Compute function option values
    if options_class is not None:
        options_class_doc = _scrape_options_class_doc(options_class)
        if options_class_doc:
            # Reuse the parameter documentation written on the options class.
            for p in options_class_doc.params:
                doc_pieces.append(f"{p.name} : {p.type}\n")
                for s in p.desc:
                    doc_pieces.append(f"    {s}\n")
        else:
            # Undocumented options class: warn, then synthesize a generic
            # entry for each constructor parameter from its signature.
            warnings.warn(f"Options class {options_class.__name__} "
                          f"does not have a docstring", RuntimeWarning)
            options_sig = inspect.signature(options_class)
            for p in options_sig.parameters.values():
                doc_pieces.append(dedent("""\
                {0} : optional
                    Parameter for {1} constructor. Either `options`
                    or `{0}` can be passed, but not both at the same time.
                """.format(p.name, options_class.__name__)))
        doc_pieces.append(dedent(f"""\
            options : pyarrow.compute.{options_class.__name__}, optional
                Alternative way of passing options.
            """))

    doc_pieces.append(dedent("""\
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """))

    # 4. Custom addition (e.g. examples)
    if doc_addition is not None:
        doc_pieces.append("\n{}\n".format(dedent(doc_addition).strip("\n")))

    wrapper.__doc__ = "".join(doc_pieces)
    return wrapper


def _get_options_class(func):
    """
    Look up the options class named by ``func._doc.options_class``.

    The name is resolved in this module's globals. Returns None when the
    function names no options class, or (after emitting a RuntimeWarning)
    when the named class is not present in the module globals.
    """
    class_name = func._doc.options_class
    if not class_name:
        return None
    try:
        return globals()[class_name]
    except KeyError:
        warnings.warn("Python binding for {} not exposed"
                      .format(class_name), RuntimeWarning)
        return None


def _handle_options(name, options_class, options, args, kwargs):
    """
    Turn the various ways of passing options into one options object.

    - If `args` or `kwargs` are non-empty they are forwarded to the
      `options_class` constructor; combining them with a non-None `options`
      raises TypeError.
    - Otherwise a dict `options` is expanded into `options_class(**options)`
      and an `options_class` instance is returned unchanged; any other
      non-None value raises TypeError.
    - Returns None when nothing was passed.
    """
    if args or kwargs:
        if options is not None:
            raise TypeError(
                "Function {!r} called with both an 'options' argument "
                "and additional arguments"
                .format(name))
        return options_class(*args, **kwargs)

    if options is not None:
        if isinstance(options, dict):
            return options_class(**options)
        elif isinstance(options, options_class):
            return options
        raise TypeError(
            "Function {!r} expected a {} parameter, got {}"
            .format(name, options_class, type(options)))

    return None


def _make_generic_wrapper(func_name, func, options_class, arity):
    """
    Build the Python callable that forwards to compute function `func`.

    Two variants exist, depending on whether the function has an options
    class. In both, an `arity` of Ellipsis disables the positional-argument
    count check, and a call whose first argument is an ``Expression`` is
    routed to ``Expression._call`` instead of ``func.call``.
    """
    if options_class is None:
        def wrapper(*args, memory_pool=None):
            # Without options, the positional count must match exactly.
            if arity is not Ellipsis and len(args) != arity:
                raise TypeError(
                    f"{func_name} takes {arity} positional argument(s), "
                    f"but {len(args)} were given"
                )
            if args and isinstance(args[0], Expression):
                return Expression._call(func_name, list(args))
            return func.call(args, None, memory_pool)
    else:
        def wrapper(*args, memory_pool=None, options=None, **kwargs):
            if arity is not Ellipsis:
                if len(args) < arity:
                    raise TypeError(
                        f"{func_name} takes {arity} positional argument(s), "
                        f"but {len(args)} were given"
                    )
                # Positionals beyond `arity` are treated as positional
                # arguments for the options class constructor.
                option_args = args[arity:]
                args = args[:arity]
            else:
                option_args = ()
            options = _handle_options(func_name, options_class, options,
                                      option_args, kwargs)
            if args and isinstance(args[0], Expression):
                return Expression._call(func_name, list(args), options)
            return func.call(args, options, memory_pool)
    return wrapper


def _make_signature(arg_names, var_arg_names, options_class):
    """
    Build the ``inspect.Signature`` advertised by a generated wrapper.

    The signature lists, in order: `arg_names` as positional-only
    parameters, `var_arg_names` as var-positional parameters, the options
    class constructor parameters followed by a keyword-only ``options``
    (only when `options_class` is not None), and a keyword-only
    ``memory_pool``.
    """
    from inspect import Parameter
    params = []
    for name in arg_names:
        params.append(Parameter(name, Parameter.POSITIONAL_ONLY))
    for name in var_arg_names:
        params.append(Parameter(name, Parameter.VAR_POSITIONAL))
    if options_class is not None:
        options_sig = inspect.signature(options_class)
        for p in options_sig.parameters.values():
            assert p.kind in (Parameter.POSITIONAL_OR_KEYWORD,
                              Parameter.KEYWORD_ONLY)
            if var_arg_names:
                # Cannot have a positional argument after a *args
                p = p.replace(kind=Parameter.KEYWORD_ONLY)
            params.append(p)
        params.append(Parameter("options", Parameter.KEYWORD_ONLY,
                                default=None))
    params.append(Parameter("memory_pool", Parameter.KEYWORD_ONLY,
                            default=None))
    return inspect.Signature(params)


def _wrap_function(name, func):
    """
    Create the fully decorated wrapper exposed as `name` for `func`.

    Combines the generic wrapper, a synthesized ``__signature__`` and the
    generated metadata/docstring.
    """
    options_class = _get_options_class(func)
    arg_names = _get_arg_names(func)
    # A trailing argument name starting with '*' marks a varargs function;
    # it is removed from the regular names and reported separately.
    has_vararg = arg_names and arg_names[-1].startswith('*')
    if has_vararg:
        var_arg_names = [arg_names.pop().lstrip('*')]
    else:
        var_arg_names = []

    wrapper = _make_generic_wrapper(
        name, func, options_class, arity=func.arity)
    wrapper.__signature__ = _make_signature(arg_names, var_arg_names,
                                            options_class)
    return _decorate_compute_function(wrapper, name, func, options_class)


def _make_global_functions():
    """
    Make global functions wrapping each compute function.

    Note that some of the automatically-generated wrappers may be overridden
    by custom versions below.
    """
    g = globals()
    reg = function_registry()

    # Avoid clashes with Python keywords
    rewrites = {'and': 'and_',
                'or': 'or_'}

    for cpp_name in reg.list_functions():
        name = rewrites.get(cpp_name, cpp_name)
        func = reg.get_function(cpp_name)
        if func.kind == "hash_aggregate":
            # Hash aggregate functions are not callable,
            # so let's not expose them at module level.
            continue
        if func.kind == "scalar_aggregate" and func.arity == 0:
            # Nullary scalar aggregate functions are not callable
            # directly so let's not expose them at module level.
            continue
        assert name not in g, name
        # Register under both the registry name and the Python-safe name
        # (they are the same object unless the name was rewritten).
        g[cpp_name] = g[name] = _wrap_function(name, func)


# Populate the module namespace at import time.
_make_global_functions()
def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
    """
    Cast array values to another data type. Can also be invoked as an array
    instance method.

    Either describe the cast with `target_type` (and optionally `safe`), or
    hand over a ready-made `options` object; mixing both styles is an error.

    Parameters
    ----------
    arr : Array-like
    target_type : DataType or str
        Type to cast to
    safe : bool, default True
        Check for overflows or other unsafe conversions
    options : CastOptions, default None
        Additional checks pass by CastOptions
    memory_pool : MemoryPool, optional
        memory pool to use for allocations during function execution.

    Returns
    -------
    casted : Array
        The cast result as a new Array

    Raises
    ------
    ValueError
        If `options` is given together with `target_type` or `safe`.

    Examples
    --------
    >>> from datetime import datetime
    >>> import pyarrow as pa
    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    >>> arr.type
    TimestampType(timestamp[us])

    You can use ``pyarrow.DataType`` objects to specify the target type:

    >>> cast(arr, pa.timestamp('ms'))
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]

    >>> cast(arr, pa.timestamp('ms')).type
    TimestampType(timestamp[ms])

    Alternatively, it is also supported to use the string aliases for these
    types:

    >>> arr.cast('timestamp[ms]')
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]
    >>> arr.cast('timestamp[ms]').type
    TimestampType(timestamp[ms])
    """
    if options is not None:
        # Explicit options exclude the target_type/safe shorthand.
        if safe is not None or target_type is not None:
            raise ValueError("Must either pass values for 'target_type' and 'safe'"
                             " or pass a value for 'options'")
        return call_function("cast", [arr], options, memory_pool)

    resolved_type = pa.types.lib.ensure_type(target_type)
    # Only an explicit ``safe=False`` selects the unsafe options.
    make_options = CastOptions.unsafe if safe is False else CastOptions.safe
    return call_function("cast", [arr], make_options(resolved_type),
                         memory_pool)
def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Find the index of the first occurrence of a given value.

    When `start` and/or `end` are given, only that slice of `data` is
    searched; a match position is shifted back by `start` so that it refers
    to the original data.

    Parameters
    ----------
    data : Array-like
    value : Scalar-like object
        The value to search for.
    start : int, optional
    end : int, optional
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : int
        the index, or -1 if not found
    """
    has_start = start is not None
    has_end = end is not None

    # Restrict the search window.
    if has_start and has_end:
        data = data.slice(start, end - start)
    elif has_start:
        data = data.slice(start)
    elif has_end:
        data = data.slice(0, end)

    # The searched value must be a scalar of the data's type.
    if not isinstance(value, pa.Scalar):
        value = pa.scalar(value, type=data.type)
    elif data.type != value.type:
        value = pa.scalar(value.as_py(), type=data.type)

    found = call_function('index', [data], IndexOptions(value=value),
                          memory_pool)

    if has_start:
        position = found.as_py()
        if position >= 0:
            # Translate the slice-relative position back to the input.
            return pa.scalar(position + start, type=pa.int64())
    return found
def take(data, indices, *, boundscheck=True, memory_pool=None):
    """
    Select values (or records) from array- or table-like data given integer
    selection indices.

    The result will be of the same type(s) as the input, with elements taken
    from the input array (or record batch / table fields) at the given
    indices. If an index is null then the corresponding value in the output
    will be null.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    indices : Array, ChunkedArray
        Must be of integer type
    boundscheck : boolean, default True
        Whether to boundscheck the indices. If False and there is an out of
        bounds index, will likely cause the process to crash.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : depends on inputs
        Selected values for the given indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> indices = pa.array([0, None, 4, 3])
    >>> arr.take(indices)
    <pyarrow.lib.StringArray object at ...>
    [
      "a",
      null,
      "e",
      null
    ]
    """
    return call_function(
        'take',
        [data, indices],
        TakeOptions(boundscheck=boundscheck),
        memory_pool,
    )
def fill_null(values, fill_value):
    """Replace each null element in values with a corresponding
    element from fill_value.

    If fill_value is scalar-like, then every null element in values
    will be replaced with fill_value. If fill_value is array-like,
    then the i-th element in values will be replaced with the i-th
    element in fill_value.

    The fill_value's type must be the same as that of values, or it
    must be able to be implicitly casted to the array's type.

    This is an alias for :func:`coalesce`.

    Parameters
    ----------
    values : Array, ChunkedArray, or Scalar-like object
        Each null element is replaced with the corresponding value
        from fill_value.
    fill_value : Array, ChunkedArray, or Scalar-like object
        If not same type as values, will attempt to cast.

    Returns
    -------
    result : depends on inputs
        Values with all null elements replaced

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
    >>> fill_value = pa.scalar(5, type=pa.int8())
    >>> arr.fill_null(fill_value)
    <pyarrow.lib.Int8Array object at ...>
    [
      1,
      2,
      5,
      3
    ]
    >>> arr = pa.array([1, 2, None, 4, None])
    >>> arr.fill_null(pa.array([10, 20, 30, 40, 50]))
    <pyarrow.lib.Int64Array object at ...>
    [
      1,
      2,
      30,
      4,
      50
    ]
    """
    if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
        # Plain Python value: wrap it as a scalar of the target type.
        fill_value = pa.scalar(fill_value, type=values.type)
    elif values.type != fill_value.type:
        if isinstance(fill_value, pa.Scalar):
            fill_value = pa.scalar(fill_value.as_py(), type=values.type)
        else:
            # Arrays and chunked arrays cannot go through the
            # ``as_py()``/``pa.scalar`` round-trip used for scalars, so
            # convert them element-wise with a cast instead.
            fill_value = fill_value.cast(values.type)

    return call_function("coalesce", [values, fill_value])
def _select_k_sort_keys(values, sort_keys, order):
    # Build the (name, order) pairs handed to SelectKOptions, always as a
    # fresh list so the caller's `sort_keys` is never modified.
    keys = [] if sort_keys is None else list(sort_keys)
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Array-like input has no column names; a placeholder key carries
        # the ordering.
        keys.append(("dummy", order))
        return keys
    return [(key_name, order) for key_name in keys]


def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the top-k ordered elements from array- or table-like
    data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get top indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
        The passed object is not modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array
        Indices of the top-k ordered elements

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.top_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at ...>
    [
      5,
      4,
      2
    ]
    """
    options = SelectKOptions(
        k, _select_k_sort_keys(values, sort_keys, "descending"))
    return call_function("select_k_unstable", [values], options, memory_pool)


def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the bottom-k ordered elements from
    array- or table-like data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get bottom indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
        The passed object is not modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices
        Indices of the bottom-k ordered elements

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.bottom_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at ...>
    [
      0,
      1,
      2
    ]
    """
    options = SelectKOptions(
        k, _select_k_sort_keys(values, sort_keys, "ascending"))
    return call_function("select_k_unstable", [values], options, memory_pool)


def random(n, *, initializer='system', options=None, memory_pool=None):
    """
    Generate numbers in the range [0, 1).

    Generated values are uniformly-distributed, double-precision
    in range [0, 1). Algorithm and seed can be changed via RandomOptions.

    Parameters
    ----------
    n : int
        Number of values to generate, must be greater than or equal to 0
    initializer : int or str
        How to initialize the underlying random generator.
        If an integer is given, it is used as a seed.
        If "system" is given, the random generator is initialized with
        a system-specific source of (hopefully true) randomness.
        Other values are invalid.
        Only used when `options` is not passed.
    options : pyarrow.compute.RandomOptions, optional
        Alternative way of passing options. When given, it is used as-is
        and `initializer` is not consulted.
    memory_pool : pyarrow.MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Raises
    ------
    TypeError
        If `options` is neither None, a dict, nor a RandomOptions instance.
    """
    # Honor an explicitly passed `options` (validated the same way as for
    # the generated wrappers) instead of silently discarding it.
    options = _handle_options("random", RandomOptions, options, (), {})
    if options is None:
        options = RandomOptions(initializer=initializer)
    return call_function("random", [], options, memory_pool, length=n)
def field(*name_or_index):
    """Reference a column of the dataset.

    Stores only the field's name. Type and other information is known only when
    the expression is bound to a dataset having an explicit scheme.

    Nested references are allowed by passing multiple names or a tuple of
    names. For example ``('foo', 'bar')`` references the field named "bar"
    inside the field named "foo".

    Parameters
    ----------
    *name_or_index : string, multiple strings, tuple or int
        The name or index of the (possibly nested) field the expression
        references to.

    Returns
    -------
    field_expr : Expression
        Reference to the given field

    Raises
    ------
    TypeError
        If a single argument is given that is not a str, int or tuple.

    Examples
    --------
    >>> import pyarrow.compute as pc
    >>> pc.field("a")
    <pyarrow.compute.Expression a>
    >>> pc.field(1)
    <pyarrow.compute.Expression FieldPath(1)>
    >>> pc.field(("a", "b"))
    <pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
    >>> pc.field("a", "b")
    <pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
    """
    if len(name_or_index) != 1:
        # In case of multiple strings not supplied in a tuple
        return Expression._nested_field(name_or_index)

    (reference,) = name_or_index
    if isinstance(reference, tuple):
        return Expression._nested_field(reference)
    if isinstance(reference, (str, int)):
        return Expression._field(reference)
    raise TypeError(
        "field reference should be str, multiple str, tuple or "
        f"integer, got {type(reference)}"
    )
def scalar(value):
    """Expression representing a scalar value.

    Parameters
    ----------
    value : bool, int, float or string
        Python value of the scalar. Note that only a subset of types are
        currently supported.

    Returns
    -------
    scalar_expr : Expression
        An Expression representing the scalar value
    """
    scalar_expr = Expression._scalar(value)
    return scalar_expr