column_collection_functions {SparkR} | R Documentation |
Collection functions defined for Column
.
array_aggregate(x, initialValue, merge, ...) array_contains(x, value) array_distinct(x) array_except(x, y) array_exists(x, f) array_forall(x, f) array_filter(x, f) array_intersect(x, y) array_join(x, delimiter, ...) array_max(x) array_min(x) array_position(x, value) array_remove(x, value) array_repeat(x, count) array_sort(x) array_transform(x, f) arrays_overlap(x, y) array_union(x, y) arrays_zip(x, ...) arrays_zip_with(x, y, f) concat(x, ...) element_at(x, extraction) explode(x) explode_outer(x) flatten(x) from_json(x, schema, ...) from_csv(x, schema, ...) map_concat(x, ...) map_entries(x) map_filter(x, f) map_from_arrays(x, y) map_from_entries(x) map_keys(x) map_values(x) map_zip_with(x, y, f) posexplode(x) posexplode_outer(x) reverse(x) schema_of_csv(x, ...) schema_of_json(x, ...) shuffle(x) size(x) slice(x, start, length) sort_array(x, asc = TRUE) transform_keys(x, f) transform_values(x, f) to_json(x, ...) to_csv(x, ...) ## S4 method for signature 'Column' reverse(x) ## S4 method for signature 'Column' to_json(x, ...) ## S4 method for signature 'Column' to_csv(x, ...) ## S4 method for signature 'Column' concat(x, ...) ## S4 method for signature 'Column,characterOrstructTypeOrColumn' from_json(x, schema, as.json.array = FALSE, ...) ## S4 method for signature 'characterOrColumn' schema_of_json(x, ...) ## S4 method for signature 'Column,characterOrstructTypeOrColumn' from_csv(x, schema, ...) ## S4 method for signature 'characterOrColumn' schema_of_csv(x, ...) 
## S4 method for signature 'characterOrColumn,Column,'function'' array_aggregate(x, initialValue, merge, finish = NULL) ## S4 method for signature 'Column' array_contains(x, value) ## S4 method for signature 'Column' array_distinct(x) ## S4 method for signature 'Column,Column' array_except(x, y) ## S4 method for signature 'characterOrColumn,'function'' array_exists(x, f) ## S4 method for signature 'characterOrColumn,'function'' array_filter(x, f) ## S4 method for signature 'characterOrColumn,'function'' array_forall(x, f) ## S4 method for signature 'Column,Column' array_intersect(x, y) ## S4 method for signature 'Column,character' array_join(x, delimiter, nullReplacement = NULL) ## S4 method for signature 'Column' array_max(x) ## S4 method for signature 'Column' array_min(x) ## S4 method for signature 'Column' array_position(x, value) ## S4 method for signature 'Column' array_remove(x, value) ## S4 method for signature 'Column,numericOrColumn' array_repeat(x, count) ## S4 method for signature 'Column' array_sort(x) ## S4 method for signature 'characterOrColumn,'function'' array_transform(x, f) ## S4 method for signature 'Column,Column' arrays_overlap(x, y) ## S4 method for signature 'Column,Column' array_union(x, y) ## S4 method for signature 'Column' arrays_zip(x, ...) ## S4 method for signature 'characterOrColumn,characterOrColumn,'function'' arrays_zip_with(x, y, f) ## S4 method for signature 'Column' shuffle(x) ## S4 method for signature 'Column' flatten(x) ## S4 method for signature 'Column' map_concat(x, ...) 
## S4 method for signature 'Column' map_entries(x) ## S4 method for signature 'characterOrColumn,'function'' map_filter(x, f) ## S4 method for signature 'Column,Column' map_from_arrays(x, y) ## S4 method for signature 'Column' map_from_entries(x) ## S4 method for signature 'Column' map_keys(x) ## S4 method for signature 'characterOrColumn,'function'' transform_keys(x, f) ## S4 method for signature 'characterOrColumn,'function'' transform_values(x, f) ## S4 method for signature 'Column' map_values(x) ## S4 method for signature 'characterOrColumn,characterOrColumn,'function'' map_zip_with(x, y, f) ## S4 method for signature 'Column' element_at(x, extraction) ## S4 method for signature 'Column' explode(x) ## S4 method for signature 'Column' size(x) ## S4 method for signature 'Column' slice(x, start, length) ## S4 method for signature 'Column' sort_array(x, asc = TRUE) ## S4 method for signature 'Column' posexplode(x) ## S4 method for signature 'Column' explode_outer(x) ## S4 method for signature 'Column' posexplode_outer(x)
x |
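Column to compute on. Note the difference in the following methods:
to_json: it is the column containing a structType, a mapType or an arrayType.
to_csv: it is the column containing a structType.
from_json: it is the column containing the JSON string.
from_csv: it is the column containing the CSV string.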
|
initialValue |
a Column used as the initial state in array_aggregate. |
merge |
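a binary function (accumulator, element) -> Column used by array_aggregate to merge each array element into the current state. |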
... |
additional argument(s).
|
value |
A value to compute on.
|
y |
Column to compute on. |
f |
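a function mapping Column(s) to a Column: the predicate, transformation or merge function applied by array_exists, array_filter, array_forall, array_transform, arrays_zip_with, map_filter, map_zip_with, transform_keys and transform_values.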
|
delimiter |
a character string that is used to concatenate the elements of column. |
count |
a Column or constant determining the number of repetitions. |
extraction |
index to check for in array or key to check for in map |
schema |
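the schema to use when parsing the JSON or CSV string: a structType object, a DDL-formatted string (e.g. "city STRING, year INT"), or a Column such as the result of schema_of_json or schema_of_csv. |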
start |
the starting index |
length |
the length of the slice |
asc |
a logical flag indicating the sorting order. If TRUE, sorting is in ascending order; if FALSE, sorting is in descending order. |
as.json.array |
a logical flag indicating whether the input string is a JSON array of objects (TRUE) or a single object (FALSE, the default). |
finish |
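a unary function (Column) -> Column that array_aggregate applies to the final state to produce the result; if NULL (the default), no finish function is supplied. |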
nullReplacement |
an optional character string that is used to replace the Null values. |
reverse
: Returns a reversed string or an array with reverse order of elements.
to_json
: Converts a column containing a structType
, a mapType
or an arrayType
into a Column of JSON string.
Resolving the Column can fail if an unsupported type is encountered.
to_csv
: Converts a column containing a structType
into a Column of CSV string.
Resolving the Column can fail if an unsupported type is encountered.
concat
: Concatenates multiple input columns together into a single column.
The function works with strings, binary and compatible array columns.
from_json
: Parses a column containing a JSON string into a Column of structType
with the specified schema
or array of structType
if as.json.array
is set
to TRUE
. If the string is unparseable, the Column will contain the value NA.
schema_of_json
: Parses a JSON string and infers its schema in DDL format.
from_csv
: Parses a column containing a CSV string into a Column of structType
with the specified schema
.
If the string is unparseable, the Column will contain the value NA.
schema_of_csv
: Parses a CSV string and infers its schema in DDL format.
array_aggregate
: Applies a binary operator to an initial state
and all elements in the array, and reduces this to a single state.
The final state is converted into the final result by applying
a finish function.
array_contains
: Returns null if the array is null, true if the array contains
the value, and false otherwise.
array_distinct
: Removes duplicate values from the array.
array_except
: Returns an array of the elements in the first array but not in the second
array, without duplicates. The order of elements in the result is not determined.
array_exists
: Returns whether a predicate holds for one or more elements in the array.
array_filter
: Returns an array of elements for which a predicate holds in a given array.
array_forall
: Returns whether a predicate holds for every element in the array.
array_intersect
: Returns an array of the elements in the intersection of the given two
arrays, without duplicates.
array_join
: Concatenates the elements of column using the delimiter.
Null values are replaced with nullReplacement if set, otherwise they are ignored.
array_max
: Returns the maximum value of the array.
array_min
: Returns the minimum value of the array.
array_position
: Locates the position of the first occurrence of the given value
in the given array. Returns NA if either of the arguments is NA.
Note: The position is not zero based, but 1 based index. Returns 0 if the given
value could not be found in the array.
array_remove
: Removes all elements equal to value from the given array.
array_repeat
: Creates an array containing x
repeated the number of times
given by count
.
array_sort
: Sorts the input array in ascending order. The elements of the input array
must be orderable. NA elements will be placed at the end of the returned array.
array_transform
: Returns an array of elements after applying
a transformation to each element in the input array.
arrays_overlap
: Returns true if the input arrays have at least one non-null element in
common. If not and both arrays are non-empty and any of them contains a null, it returns null.
It returns false otherwise.
array_union
: Returns an array of the elements in the union of the given two arrays,
without duplicates.
arrays_zip
: Returns a merged array of structs in which the N-th struct contains all N-th
values of input arrays.
arrays_zip_with
: Merges two given arrays, element-wise, into a single array
using a function. If one array is shorter, nulls are appended at the end
to match the length of the longer array, before applying the function.
shuffle
: Returns a random permutation of the given array.
flatten
: Creates a single array from an array of arrays.
If a structure of nested arrays is deeper than two levels, only one level of nesting is removed.
map_concat
: Returns the union of all the given maps.
map_entries
: Returns an unordered array of all entries in the given map.
map_filter
: Returns a map whose key-value pairs satisfy a predicate.
map_from_arrays
: Creates a new map column. The array in the first column is used for
keys. The array in the second column is used for values. All elements in the array for key
should not be null.
map_from_entries
: Returns a map created from the given array of entries.
map_keys
: Returns an unordered array containing the keys of the map.
transform_keys
: Applies a function to every key-value pair in a map and returns
a map with the results of those applications as the new keys for the pairs.
transform_values
: Applies a function to every key-value pair in a map and returns
a map with the results of those applications as the new values for the pairs.
map_values
: Returns an unordered array containing the values of the map.
map_zip_with
: Merges two given maps, key-wise, into a single map using a function.
element_at
: Returns element of array at given index in extraction
if
x
is array. Returns value for the given key in extraction
if x
is map.
Note: The position is not zero based, but 1 based index.
explode
: Creates a new row for each element in the given array or map column.
Uses the default column name col
for elements in the array and
key
and value
for elements in the map unless specified otherwise.
size
: Returns length of array or map.
slice
: Returns an array containing all the elements in x from the index start
(array indices start at 1, or from the end if start is negative) with the specified length.
sort_array
: Sorts the input array in ascending or descending order according to
the natural ordering of the array elements. NA elements will be placed at the beginning of
the returned array in ascending order or at the end of the returned array in descending order.
posexplode
: Creates a new row for each element with position in the given array
or map column. Uses the default column name pos
for position, and col
for elements in the array and key
and value
for elements in the map
unless specified otherwise.
explode_outer
: Creates a new row for each element in the given array or map column.
Unlike explode
, if the array/map is null
or empty
then null
is produced.
Uses the default column name col
for elements in the array and
key
and value
for elements in the map unless specified otherwise.
posexplode_outer
: Creates a new row for each element with position in the given
array or map column. Unlike posexplode
, if the array/map is null
or empty
then the row (null
, null
) is produced.
Uses the default column name pos
for position, and col
for elements in the array and key
and value
for elements in the map
unless specified otherwise.
reverse since 1.5.0
to_json since 2.2.0
to_csv since 3.0.0
concat since 1.5.0
from_json since 2.2.0
schema_of_json since 3.0.0
from_csv since 3.0.0
schema_of_csv since 3.0.0
array_aggregate since 3.1.0
array_contains since 1.6.0
array_distinct since 2.4.0
array_except since 2.4.0
array_exists since 3.1.0
array_filter since 3.1.0
array_forall since 3.1.0
array_intersect since 2.4.0
array_join since 2.4.0
array_max since 2.4.0
array_min since 2.4.0
array_position since 2.4.0
array_remove since 2.4.0
array_repeat since 2.4.0
array_sort since 2.4.0
array_transform since 3.1.0
arrays_overlap since 2.4.0
array_union since 2.4.0
arrays_zip since 2.4.0
arrays_zip_with since 3.1.0
shuffle since 2.4.0
flatten since 2.4.0
map_concat since 3.0.0
map_entries since 3.0.0
map_filter since 3.1.0
map_from_arrays since 2.4.0
map_from_entries since 3.0.0
map_keys since 2.3.0
transform_keys since 3.1.0
transform_values since 3.1.0
map_values since 2.3.0
map_zip_with since 3.1.0
element_at since 2.4.0
explode since 1.5.0
size since 1.5.0
slice since 2.4.0
sort_array since 1.6.0
posexplode since 2.1.0
explode_outer since 2.3.0
posexplode_outer since 2.3.0
## Not run:
##D # Dataframe used throughout this doc
##D df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
##D tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
##D head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1), shuffle(tmp$v1)))
##D head(select(tmp, array_max(tmp$v1), array_min(tmp$v1), array_distinct(tmp$v1)))
##D head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1)))
##D head(select(tmp, reverse(tmp$v1), array_remove(tmp$v1, 21)))
##D head(select(tmp, array_transform("v1", function(x) x * 10)))
##D head(select(tmp, array_exists("v1", function(x) x > 120)))
##D head(select(tmp, array_forall("v1", function(x) x >= 8.0)))
##D head(select(tmp, array_filter("v1", function(x) x < 10)))
##D head(select(tmp, array_aggregate("v1", lit(0), function(acc, y) acc + y)))
##D head(select(
##D tmp,
##D array_aggregate("v1", lit(0), function(acc, y) acc + y, function(acc) acc / 10)))
##D tmp2 <- mutate(tmp, v2 = explode(tmp$v1))
##D head(tmp2)
##D head(select(tmp, posexplode(tmp$v1)))
##D head(select(tmp, slice(tmp$v1, 2L, 2L)))
##D head(select(tmp, sort_array(tmp$v1)))
##D head(select(tmp, sort_array(tmp$v1, asc = FALSE)))
##D tmp3 <- mutate(df, v3 = create_map(df$model, df$cyl))
##D head(select(tmp3, map_entries(tmp3$v3), map_keys(tmp3$v3), map_values(tmp3$v3)))
##D head(select(tmp3, element_at(tmp3$v3, "Valiant"), map_concat(tmp3$v3, tmp3$v3)))
##D head(select(tmp3, transform_keys("v3", function(k, v) upper(k))))
##D head(select(tmp3, transform_values("v3", function(k, v) v * 10)))
##D head(select(tmp3, map_filter("v3", function(k, v) v < 42)))
##D tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
##D head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
##D head(select(tmp4, array_except(tmp4$v4, tmp4$v5), array_intersect(tmp4$v4, tmp4$v5)))
##D head(select(tmp4, array_union(tmp4$v4, tmp4$v5)))
##D head(select(tmp4, arrays_zip(tmp4$v4, tmp4$v5)))
##D head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
##D head(select(tmp4, arrays_zip_with(tmp4$v4, tmp4$v5, function(x, y) x * y)))
##D tmp5 <- mutate(df, v6 = create_array(df$model, df$model))
##D head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL")))
##D tmp6 <- mutate(df, v7 = create_array(create_array(df$model, df$model)))
##D head(select(tmp6, flatten(tmp6$v7)))
##D tmp7 <- mutate(df, v8 = create_array(df$model, df$cyl), v9 = create_array(df$model, df$hp))
##D head(select(tmp7, arrays_zip_with("v8", "v9", function(x, y) (x * y) %% 3)))
##D head(select(tmp7, map_from_arrays(tmp7$v8, tmp7$v9)))
##D tmp8 <- mutate(df, v10 = create_array(struct(df$model, df$cyl)))
##D head(select(tmp8, map_from_entries(tmp8$v10)))
## End(Not run)
## Not run:
##D # Converts a struct into a JSON object
##D df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
##D select(df2, to_json(df2$d, dateFormat = 'dd/MM/yyyy'))
##D
##D # Converts an array of structs into a JSON array
##D df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
##D df2 <- mutate(df2, people_json = to_json(df2$people))
##D
##D # Converts a map into a JSON object
##D df2 <- sql("SELECT map('name', 'Bob') as people")
##D df2 <- mutate(df2, people_json = to_json(df2$people))
##D
##D # Converts an array of maps into a JSON array
##D df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
##D df2 <- mutate(df2, people_json = to_json(df2$people))
##D
##D # Converts a map into a pretty JSON object
##D df2 <- sql("SELECT map('name', 'Bob') as people")
##D df2 <- mutate(df2, people_json = to_json(df2$people, pretty = TRUE))
## End(Not run)
## Not run:
##D # Converts a struct into a CSV string
##D df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
##D select(df2, to_csv(df2$d, dateFormat = 'dd/MM/yyyy'))
## End(Not run)
## Not run:
##D df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
##D df2 <- mutate(df2, d2 = to_json(df2$d, dateFormat = 'dd/MM/yyyy'))
##D schema <- structType(structField("date", "string"))
##D head(select(df2, from_json(df2$d2, schema, dateFormat = 'dd/MM/yyyy')))
##D df2 <- sql("SELECT named_struct('name', 'Bob') as people")
##D df2 <- mutate(df2, people_json = to_json(df2$people))
##D schema <- structType(structField("name", "string"))
##D head(select(df2, from_json(df2$people_json, schema)))
##D head(select(df2, from_json(df2$people_json, "name STRING")))
##D head(select(df2, from_json(df2$people_json, schema_of_json(head(df2)$people_json))))
## End(Not run)
## Not run:
##D json <- "{\"name\":\"Bob\"}"
##D df <- sql("SELECT * FROM range(1)")
##D head(select(df, schema_of_json(json)))
## End(Not run)
## Not run:
##D csv <- "Amsterdam,2018"
##D df <- sql(paste0("SELECT '", csv, "' as csv"))
##D schema <- "city STRING, year INT"
##D head(select(df, from_csv(df$csv, schema)))
##D head(select(df, from_csv(df$csv, structType(schema))))
##D head(select(df, from_csv(df$csv, schema_of_csv(csv))))
## End(Not run)
## Not run:
##D csv <- "Amsterdam,2018"
##D df <- sql("SELECT * FROM range(1)")
##D head(select(df, schema_of_csv(csv)))
## End(Not run)
## Not run:
##D df2 <- createDataFrame(data.frame(
##D id = c(1, 2, 3), text = c("a,b,c", NA, "d,e")
##D ))
##D
##D head(select(df2, df2$id, explode_outer(split_string(df2$text, ","))))
##D head(select(df2, df2$id, posexplode_outer(split_string(df2$text, ","))))
## End(Not run)