Learning the Spark Python API Functions
pyspark version
# print the version of the PySpark installation (assumes an existing SparkContext sc)
print("pyspark version:" + str(sc.version))

pyspark version:
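All of the snippets here assume a live SparkContext already bound to the name sc, as in the PySpark shell or a notebook. If you run them as a standalone script, a minimal setup (the master URL and app name below are placeholders of my own choosing) looks like this:

from pyspark import SparkContext

# assumption: run locally with 2 worker threads; adjust master and appName as needed
sc = SparkContext(master="local[2]", appName="rdd-api-examples")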
map
# map
# sc = spark context, parallelize creates an RDD from the passed object
x = sc.parallelize([1,2,3])
y = x.map(lambda x: (x, x**2))

# collect copies RDD elements to a list on the driver
print(x.collect())
print(y.collect())

[1, 2, 3]
[(1, 1), (2, 4), (3, 9)]
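map produces exactly one output element for every input element, so the result always has the same length as the source RDD. As a further illustration (my own example, not from the original), the same pattern works on strings:

words = sc.parallelize(["spark", "python", "api"])
pairs = words.map(lambda w: (w, len(w)))
print(pairs.collect())   # [('spark', 5), ('python', 6), ('api', 3)]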
flatMap
# flatMap
x = sc.parallelize([1,2,3])
y = x.flatMap(lambda x: (x, 100*x, x**2))
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 100, 1, 2, 200, 4, 3, 300, 9]
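Where map returns one element per input, flatMap may return zero or more per input and flattens the results into a single RDD. A typical use (an illustrative sketch, not part of the original) is splitting lines of text into words:

lines = sc.parallelize(["hello world", "hello spark"])
words = lines.flatMap(lambda line: line.split(" "))
print(words.collect())   # ['hello', 'world', 'hello', 'spark']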
mapPartitions
# mapPartitions
x = sc.parallelize([1,2,3], 2)
def f(iterator): yield sum(iterator)
y = x.mapPartitions(f)

# glom() groups the elements of each partition into a list
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[1], [5]]
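mapPartitions calls the function once per partition with an iterator over that partition's elements, which is useful when there is per-partition setup cost (opening a connection, loading a model, and so on). A minimal sketch, with counting logic of my own:

x = sc.parallelize(range(10), 2)

def count_in_partition(iterator):
    # per-partition setup would go here, executed once per partition
    yield sum(1 for _ in iterator)

print(x.mapPartitions(count_in_partition).collect())   # [5, 5]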
mapPartitionsWithIndex
# mapPartitionsWithIndex
x = sc.parallelize([1,2,3], 2)
def f(partitionIndex, iterator): yield (partitionIndex, sum(iterator))
y = x.mapPartitionsWithIndex(f)

# glom() groups the elements of each partition into a list
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[(0, 1)], [(1, 5)]]
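The partition index lets one partition be treated differently from the rest, for example dropping a header that is known to sit at the start of partition 0. A sketch with made-up data:

data = sc.parallelize(["header", "row1", "row2", "row3"], 2)

def drop_header(idx, iterator):
    if idx == 0:
        next(iterator, None)   # skip the first element of partition 0
    return iterator

print(data.mapPartitionsWithIndex(drop_header).collect())   # ['row1', 'row2', 'row3']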
getNumPartitions
# getNumPartitions
x = sc.parallelize([1,2,3], 2)
y = x.getNumPartitions()
print(x.glom().collect())
print(y)

[[1], [2, 3]]
2
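The partition count can also be changed after an RDD exists: repartition shuffles the data into the requested number of partitions, while coalesce only reduces the count and avoids a full shuffle. A short sketch:

x = sc.parallelize([1, 2, 3, 4], 4)
print(x.getNumPartitions())                  # 4
print(x.coalesce(2).getNumPartitions())      # 2
print(x.repartition(8).getNumPartitions())   # 8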
filter
# filter
x = sc.parallelize([1,2,3])
y = x.filter(lambda x: x % 2 == 1)  # filters out even elements, keeping the odd ones
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 3]
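filter keeps the partitioning of its parent, so a partition can end up empty after filtering; glom() makes this visible (my own illustration):

x = sc.parallelize([1, 2, 3, 4], 2)
y = x.filter(lambda n: n > 2)
print(y.glom().collect())   # [[], [3, 4]]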
distinct
# distinct
x = sc.parallelize(['A','A','B'])
y = x.distinct()
print(x.collect())
print(y.collect())

['A', 'A', 'B']
['A', 'B']
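distinct works on any hashable elements, including tuples, and takes an optional number of output partitions; note that it requires a shuffle, so the order of the result is not guaranteed. A small sketch:

pairs = sc.parallelize([(1, 'a'), (1, 'a'), (2, 'b')])
print(pairs.distinct(numPartitions=2).collect())   # [(1, 'a'), (2, 'b')] in some order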
sample
# sample
x = sc.parallelize(range(7))
# call 'sample' 5 times; each call draws an independent random subset
ylist = [x.sample(withReplacement=False, fraction=0.5) for i in range(5)]
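sample(withReplacement, fraction, seed=None) draws each element independently, so the size of the result is only approximately fraction times the RDD size and varies between calls; passing a seed makes a draw reproducible. A hedged example of my own:

y = x.sample(withReplacement=False, fraction=0.5, seed=42)
print(y.collect())   # roughly half of the elements of x; the exact subset depends on the seed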