### Creation

| Function | Description | Example |
|---|---|---|
| `sc.parallelize(data)` | Create RDD from a Python list | `rdd = sc.parallelize([1, 2, 3])` |
| `sc.textFile(path)` | Read text file (each line = 1 element) | `rdd = sc.textFile("data.txt")` |
| `sc.wholeTextFiles(path)` | Read files as (filename, content) pairs | `rdd = sc.wholeTextFiles("folder/")` |
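A minimal sketch tying the creation APIs together. It assumes a local Spark installation; `data.txt` and `folder/` are the placeholder paths from the table, and the app name is arbitrary. Later examples reuse this `sc`.

```python
from pyspark import SparkContext

# Local SparkContext; "local[*]" uses all available cores.
sc = SparkContext("local[*]", "rdd-cheatsheet")

nums = sc.parallelize([1, 2, 3, 4, 5])   # RDD from a Python list
lines = sc.textFile("data.txt")          # one element per line (file assumed to exist)
files = sc.wholeTextFiles("folder/")     # (filename, content) pairs

print(nums.count())  # 5
```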
### Basic Transformations

| Function | Description | Example |
|---|---|---|
| `map(func)` | Apply function to each element | `rdd.map(lambda x: x * 2)` |
| `flatMap(func)` | Apply function and flatten the results | `rdd.flatMap(lambda x: x.split(" "))` |
| `filter(func)` | Keep elements matching a condition | `rdd.filter(lambda x: x > 10)` |
| `distinct()` | Remove duplicate elements | `rdd.distinct()` |
| `union(rdd2)` | Combine two RDDs | `rdd1.union(rdd2)` |
| `intersection(rdd2)` | Keep common elements | `rdd1.intersection(rdd2)` |
| `subtract(rdd2)` | Elements in the first RDD but not the second | `rdd1.subtract(rdd2)` |
| `cartesian(rdd2)` | All pairs between two RDDs | `rdd1.cartesian(rdd2)` |
| `sample(withReplacement, fraction)` | Random sample | `rdd.sample(False, 0.5)` |
| `sortBy(func, ascending)` | Sort elements by the result of `func` | `rdd.sortBy(lambda x: x, False)` |
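A short sketch chaining several of these transformations, assuming the `sc` created above. The input strings are made up for illustration.

```python
words = sc.parallelize(["spark makes rdds", "rdds are lazy"])

tokens = (words
          .flatMap(lambda line: line.split(" "))  # flatten lines into words
          .filter(lambda w: len(w) > 3)           # keep words longer than 3 chars
          .distinct()                             # drop duplicates
          .sortBy(lambda w: w))                   # sort alphabetically

print(tokens.collect())  # ['lazy', 'makes', 'rdds', 'spark']
```

Transformations are lazy: nothing runs until an action such as `collect()` is called.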
### Pair RDD (Key, Value)

| Function | Description | Example |
|---|---|---|
| `mapValues(func)` | Apply function to values only | `rdd.mapValues(lambda x: x + 1)` |
| `reduceByKey(func)` | Merge values with the same key using `func` | `rdd.reduceByKey(lambda a, b: a + b)` |
| `groupByKey()` | Group values by key | `rdd.groupByKey()` |
| `sortByKey()` | Sort key-value pairs by key | `rdd.sortByKey()` |
| `join(rdd2)` | Inner join by key | `rdd1.join(rdd2)` |
| `leftOuterJoin(rdd2)` | Keep all keys from the left RDD | `rdd1.leftOuterJoin(rdd2)` |
| `rightOuterJoin(rdd2)` | Keep all keys from the right RDD | `rdd1.rightOuterJoin(rdd2)` |
| `cogroup(rdd2)` | Group values from both RDDs by key | `rdd1.cogroup(rdd2)` |
| `combineByKey(...)` | Custom per-key aggregation (advanced) | `rdd.combineByKey(...)` |
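A sketch of the pair-RDD operations, including one way to fill in `combineByKey`'s three arguments (create a combiner, fold a value in, merge two combiners) to compute a per-key average. The sample data is invented.

```python
sales = sc.parallelize([("a", 3), ("b", 5), ("a", 7)])

totals = sales.reduceByKey(lambda x, y: x + y)    # ('a', 10), ('b', 5)

# combineByKey(createCombiner, mergeValue, mergeCombiners) -> per-key average
sums = sales.combineByKey(
    lambda v: (v, 1),                             # first value -> (sum, count)
    lambda acc, v: (acc[0] + v, acc[1] + 1),      # fold a value into the accumulator
    lambda a, b: (a[0] + b[0], a[1] + b[1]))      # merge accumulators across partitions
avgs = sums.mapValues(lambda p: p[0] / p[1])      # ('a', 5.0), ('b', 5.0)

names = sc.parallelize([("a", "apples"), ("b", "bananas")])
print(totals.join(names).collect())  # e.g. [('a', (10, 'apples')), ('b', (5, 'bananas'))]
```

Prefer `reduceByKey`/`combineByKey` over `groupByKey` for aggregations: they combine values on each partition before shuffling.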
### Actions

| Function | Description | Example |
|---|---|---|
| `collect()` | Return all elements to the driver as a list | `rdd.collect()` |
| `count()` | Count the number of elements | `rdd.count()` |
| `first()` | First element | `rdd.first()` |
| `take(n)` | First n elements | `rdd.take(5)` |
| `top(n)` | Top n elements in descending order | `rdd.top(3)` |
| `reduce(func)` | Reduce using the given function | `rdd.reduce(lambda a, b: a + b)` |
| `sum()` | Sum of all elements | `rdd.sum()` |
| `max()` / `min()` | Maximum / minimum element | `rdd.max()` |
| `countByValue()` | Count frequency of each value | `rdd.countByValue()` |
| `saveAsTextFile(path)` | Save results to text files | `rdd.saveAsTextFile("output/")` |
| `foreach(func)` | Apply function to each element (runs on the executors) | `rdd.foreach(print)` |
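The common actions on one small made-up RDD, with expected results inline:

```python
rdd = sc.parallelize([4, 1, 3, 1, 5])

print(rdd.collect())                   # [4, 1, 3, 1, 5] -- all data to the driver
print(rdd.count())                     # 5
print(rdd.take(2))                     # [4, 1]
print(rdd.top(2))                      # [5, 4] -- descending order
print(rdd.reduce(lambda a, b: a + b))  # 14
print(dict(rdd.countByValue()))        # {4: 1, 1: 2, 3: 1, 5: 1}
```

Note that `collect()` pulls the entire dataset into driver memory; on large RDDs prefer `take(n)` or save to files.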
### Persistence / Storage

| Function | Description | Example |
|---|---|---|
| `cache()` | Store RDD in memory | `rdd.cache()` |
| `persist(level)` | Store with a custom storage level | `rdd.persist(StorageLevel.MEMORY_AND_DISK)` |
| `unpersist()` | Remove RDD from cache | `rdd.unpersist()` |
| `is_cached` | Check whether the RDD is cached | `rdd.is_cached` |
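A sketch of the persistence lifecycle. `StorageLevel` must be imported from `pyspark`, and caching only takes effect once an action materializes the RDD:

```python
from pyspark import StorageLevel

rdd = sc.parallelize(range(10)).map(lambda x: x * x)

rdd.persist(StorageLevel.MEMORY_AND_DISK)  # spill to disk if memory is tight
rdd.count()                                # first action materializes the cache
print(rdd.is_cached)                       # True
rdd.unpersist()                            # free the storage
```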
### Partitioning / Inspection

| Function | Description | Example |
|---|---|---|
| `getNumPartitions()` | Get the number of partitions | `rdd.getNumPartitions()` |
| `repartition(n)` | Change the number of partitions (full shuffle) | `rdd.repartition(4)` |
| `coalesce(n)` | Decrease partitions (avoids a full shuffle) | `rdd.coalesce(2)` |
| `glom()` | Return the elements of each partition as a list | `rdd.glom().collect()` |
| `mapPartitions(func)` | Operate on each partition's iterator | `rdd.mapPartitions(lambda it: [sum(it)])` |
| `mapPartitionsWithIndex(func)` | See which elements belong to which partition | `rdd.mapPartitionsWithIndex(lambda i, it: [(i, list(it))]).collect()` |
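A sketch inspecting partitions on a small local RDD; the partition layout shown is what local mode produces for this input, and may differ on a cluster:

```python
rdd = sc.parallelize(range(8), 4)          # explicitly ask for 4 partitions

print(rdd.getNumPartitions())              # 4
print(rdd.glom().collect())                # [[0, 1], [2, 3], [4, 5], [6, 7]]

# Sum each partition independently (func receives one iterator per partition):
print(rdd.mapPartitions(lambda it: [sum(it)]).collect())  # [1, 5, 9, 13]

smaller = rdd.coalesce(2)                  # merge down to 2 partitions, no full shuffle
print(smaller.getNumPartitions())          # 2
```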
### File Operations

| Function | Description | Example |
|---|---|---|
| `textFile(path)` | Load data from a text file | `sc.textFile("data.txt")` |
| `wholeTextFiles(path)` | Load (filename, content) pairs | `sc.wholeTextFiles("folder/")` |
| `saveAsTextFile(path)` | Save RDD to text files | `rdd.saveAsTextFile("out/")` |
| `saveAsSequenceFile(path)` | Save a pair RDD as a Hadoop SequenceFile | `rdd.saveAsSequenceFile("seq_out")` |
| `saveAsObjectFile(path)` | Save as serialized objects | `rdd.saveAsObjectFile("obj_out")` |
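A save-and-reload sketch using the placeholder paths from the table. Each output path is a directory of part-files, and Spark refuses to write to a path that already exists:

```python
rdd = sc.parallelize(["alpha", "beta", "gamma"])
rdd.saveAsTextFile("out/")                 # writes out/part-00000, ...

pairs = sc.parallelize([(1, "a"), (2, "b")])
pairs.saveAsSequenceFile("seq_out")        # SequenceFile output needs a pair RDD

roundtrip = sc.textFile("out/")            # reads all part-files back
print(roundtrip.collect())                 # ['alpha', 'beta', 'gamma'] (order may vary)
```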