digraph G {
0 [labelType="html" label="<br><b>AdaptiveSparkPlan</b><br><br>"];
subgraph cluster1 {
isCluster="true";
label="WholeStageCodegen (2)\n \nduration: 3 ms";
2 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build: 2 ms<br>number of output rows: 1"];
}
3 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 72<br>local merged chunks fetched: 0<br>shuffle write time total (min, med, max (stageId: taskId))<br>24 ms (0 ms, 0 ms, 1 ms (stage 154.0: task 656))<br>remote merged bytes read: 0.0 B<br>local merged blocks fetched: 0<br>corrupt merged block chunks: 0<br>remote merged reqs duration: 0 ms<br>remote merged blocks fetched: 0<br>records read: 72<br>local bytes read: 4.1 KiB<br>fetch wait time: 0 ms<br>remote bytes read: 0.0 B<br>merged fetch fallback count: 0<br>local blocks read: 72<br>remote merged chunks fetched: 0<br>remote blocks read: 0<br>data size total (min, med, max (stageId: taskId))<br>1152.0 B (0.0 B, 16.0 B, 16.0 B (stage 154.0: task 617))<br>local merged bytes read: 0.0 B<br>number of partitions: 1<br>remote reqs duration: 0 ms<br>remote bytes read to disk: 0.0 B<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>4.1 KiB (0.0 B, 59.0 B, 59.0 B (stage 154.0: task 617))"];
subgraph cluster4 {
isCluster="true";
label="WholeStageCodegen (1)\n \nduration: total (min, med, max (stageId: taskId))\n4.8 s (6 ms, 73 ms, 186 ms (stage 154.0: task 631))";
5 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build total (min, med, max (stageId: taskId))<br>4.7 s (5 ms, 73 ms, 186 ms (stage 154.0: task 631))<br>number of output rows: 72"];
}
6 [labelType="html" label="<b>InMemoryTableScan</b><br><br>number of output rows: 48,408,771"];
7 [labelType="html" label="<br><b>AdaptiveSparkPlan</b><br><br>"];
subgraph cluster8 {
isCluster="true";
label="WholeStageCodegen (3)\n \nduration: total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 154.0: task 617))";
9 [labelType="html" label="<br><b>Project</b><br><br>"];
}
10 [labelType="html" label="<b>Window</b><br><br>spill size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))"];
subgraph cluster11 {
isCluster="true";
label="WholeStageCodegen (2)\n \nduration: total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 154.0: task 617))";
12 [labelType="html" label="<b>Sort</b><br><br>sort time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 154.0: task 617))<br>peak memory total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))<br>spill size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))"];
}
13 [labelType="html" label="<br><b>AQEShuffleRead</b><br><br>"];
14 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 0<br>local merged chunks fetched: 0<br>shuffle write time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 154.0: task 617))<br>remote merged bytes read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))<br>local merged blocks fetched: 0<br>corrupt merged block chunks: 0<br>remote merged reqs duration total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 154.0: task 617))<br>remote merged blocks fetched: 0<br>records read: 0<br>local bytes read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))<br>fetch wait time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 154.0: task 617))<br>remote bytes read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))<br>merged fetch fallback count: 0<br>local blocks read: 0<br>remote merged chunks fetched: 0<br>remote blocks read: 0<br>data size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))<br>local merged bytes read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))<br>number of partitions: 0<br>remote reqs duration total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 154.0: task 617))<br>remote bytes read to disk total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))"];
subgraph cluster15 {
isCluster="true";
label="WholeStageCodegen (1)\n \nduration: total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 154.0: task 617))";
16 [labelType="html" label="<br><b>Project</b><br><br>"];
}
17 [labelType="html" label="<b>Scan csv </b><br><br>number of output rows: 0<br>number of files read: 0<br>metadata time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 154.0: task 617))<br>size of files read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 154.0: task 617))"];
2->0;
3->2;
5->3;
6->5;
7->6;
9->7;
10->9;
12->10;
13->12;
14->13;
16->14;
17->16;
}
18
AdaptiveSparkPlan isFinalPlan=true
HashAggregate(keys=[], functions=[count(1)])
WholeStageCodegen (2)
Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=2254]
HashAggregate(keys=[], functions=[partial_count(1)])
WholeStageCodegen (1)
InMemoryTableScan
AdaptiveSparkPlan isFinalPlan=true
Project [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10655, end_station_name#10536, end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, 2026-04-08 09:56:30.312891 AS _processed_dttm#10645, _start_station_ride_num#10647, year#10648, month#10649]
WholeStageCodegen (3)
Window [row_number() windowspecdefinition(start_station_id#10535, started_at#10532 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _start_station_ride_num#10647], [start_station_id#10535], [started_at#10532 ASC NULLS FIRST]
Sort [start_station_id#10535 ASC NULLS FIRST, started_at#10532 ASC NULLS FIRST], false, 0
WholeStageCodegen (2)
AQEShuffleRead coalesced
Exchange hashpartitioning(start_station_id#10535, 200), ENSURE_REQUIREMENTS, [plan_id=2133]
Project [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, cast(start_station_id#10535 as double) AS start_station_id#10655, end_station_name#10536, cast(end_station_id#10537 as double) AS end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, isnotnull(ride_id#10530) AS valid_ride_id#10641, (ended_at#10533 > started_at#10532) AS valid_time#10642, (((isnotnull(end_station_id#10537) AND isnotnull(start_station_id#10535)) AND NOT (end_station_id#10537 = start_station_id#10535)) <=> true) AS valid_station#10643, input_file_name() AS _source_file#10644, year(cast(started_at#10532 as date)) AS year#10648, month(cast(started_at#10532 as date)) AS month#10649, start_station_id#10535, started_at#10532]
WholeStageCodegen (1)
FileScan csv [ride_id#10530,rideable_type#10531,started_at#10532,ended_at#10533,start_station_name#10534,start_station_id#10535,end_station_name#10536,end_station_id#10537,start_lat#10538,start_lng#10539,end_lat#10540,end_lng#10541,member_casual#10542] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(55 paths)[s3a://rzvde-g8-kirsanov-dmitry/raw/citibike_data/202502/202502-citibi..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ride_id:string,rideable_type:string,started_at:timestamp,ended_at:timestamp,start_station_...
== Physical Plan ==
AdaptiveSparkPlan (18)
+- == Final Plan ==
* HashAggregate (14)
+- ShuffleQueryStage (13), Statistics(sizeInBytes=1152.0 B, rowCount=72)
+- Exchange (12)
+- * HashAggregate (11)
+- TableCacheQueryStage (10), Statistics(sizeInBytes=12.5 GiB, rowCount=4.84E+7)
+- InMemoryTableScan (1)
+- InMemoryRelation (2)
+- AdaptiveSparkPlan (9)
+- Project (8)
+- Window (7)
+- Sort (6)
+- Exchange (5)
+- Project (4)
+- Scan csv (3)
+- == Initial Plan ==
HashAggregate (17)
+- Exchange (16)
+- HashAggregate (15)
+- InMemoryTableScan (1)
+- InMemoryRelation (2)
+- AdaptiveSparkPlan (9)
+- Project (8)
+- Window (7)
+- Sort (6)
+- Exchange (5)
+- Project (4)
+- Scan csv (3)
(1) InMemoryTableScan
Output: []
(2) InMemoryRelation
Arguments: [ride_id#10650, rideable_type#10651, started_at#10652, ended_at#10653, start_station_name#10654, start_station_id#10655, end_station_name#10656, end_station_id#10657, start_lat#10658, start_lng#10659, end_lat#10660, end_lng#10661, member_casual#10662, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, _processed_dttm#10645, _start_station_ride_num#10647, year#10648, month#10649], CachedRDDBuilder(org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer@43d79ed6,StorageLevel(disk, memory, deserialized, 1 replicas),AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
*(3) Project [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10655, end_station_name#10536, end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, 2026-04-08 09:56:30.312891 AS _processed_dttm#10645, _start_station_ride_num#10647, year#10648, month#10649]
+- Window [row_number() windowspecdefinition(start_station_id#10535, started_at#10532 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _start_station_ride_num#10647], [start_station_id#10535], [started_at#10532 ASC NULLS FIRST]
+- *(2) Sort [start_station_id#10535 ASC NULLS FIRST, started_at#10532 ASC NULLS FIRST], false, 0
+- AQEShuffleRead coalesced
+- ShuffleQueryStage 0
+- Exchange hashpartitioning(start_station_id#10535, 200), ENSURE_REQUIREMENTS, [plan_id=2133]
+- *(1) Project [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, cast(start_station_id#10535 as double) AS start_station_id#10655, end_station_name#10536, cast(end_station_id#10537 as double) AS end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, isnotnull(ride_id#10530) AS valid_ride_id#10641, (ended_at#10533 > started_at#10532) AS valid_time#10642, (((isnotnull(end_station_id#10537) AND isnotnull(start_station_id#10535)) AND NOT (end_station_id#10537 = start_station_id#10535)) <=> true) AS valid_station#10643, input_file_name() AS _source_file#10644, year(cast(started_at#10532 as date)) AS year#10648, month(cast(started_at#10532 as date)) AS month#10649, start_station_id#10535, started_at#10532]
+- FileScan csv [ride_id#10530,rideable_type#10531,started_at#10532,ended_at#10533,start_station_name#10534,start_station_id#10535,end_station_name#10536,end_station_id#10537,start_lat#10538,start_lng#10539,end_lat#10540,end_lng#10541,member_casual#10542] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(55 paths)[s3a://rzvde-g8-kirsanov-dmitry/raw/citibike_data/202502/202502-citibi..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ride_id:string,rideable_type:string,started_at:timestamp,ended_at:timestamp,start_station_...
+- == Initial Plan ==
Project [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10655, end_station_name#10536, end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, 2026-04-08 09:56:30.312891 AS _processed_dttm#10645, _start_station_ride_num#10647, year#10648, month#10649]
+- Window [row_number() windowspecdefinition(start_station_id#10535, started_at#10532 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _start_station_ride_num#10647], [start_station_id#10535], [started_at#10532 ASC NULLS FIRST]
+- Sort [start_station_id#10535 ASC NULLS FIRST, started_at#10532 ASC NULLS FIRST], false, 0
+- Exchange hashpartitioning(start_station_id#10535, 200), ENSURE_REQUIREMENTS, [plan_id=2095]
+- Project [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, cast(start_station_id#10535 as double) AS start_station_id#10655, end_station_name#10536, cast(end_station_id#10537 as double) AS end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, isnotnull(ride_id#10530) AS valid_ride_id#10641, (ended_at#10533 > started_at#10532) AS valid_time#10642, (((isnotnull(end_station_id#10537) AND isnotnull(start_station_id#10535)) AND NOT (end_station_id#10537 = start_station_id#10535)) <=> true) AS valid_station#10643, input_file_name() AS _source_file#10644, year(cast(started_at#10532 as date)) AS year#10648, month(cast(started_at#10532 as date)) AS month#10649, start_station_id#10535, started_at#10532]
+- FileScan csv [ride_id#10530,rideable_type#10531,started_at#10532,ended_at#10533,start_station_name#10534,start_station_id#10535,end_station_name#10536,end_station_id#10537,start_lat#10538,start_lng#10539,end_lat#10540,end_lng#10541,member_casual#10542] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(55 paths)[s3a://rzvde-g8-kirsanov-dmitry/raw/citibike_data/202502/202502-citibi..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ride_id:string,rideable_type:string,started_at:timestamp,ended_at:timestamp,start_station_...
,None)
(3) Scan csv
Output [13]: [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10535, end_station_name#10536, end_station_id#10537, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542]
Batched: false
Location: InMemoryFileIndex [s3a://rzvde-g8-kirsanov-dmitry/raw/citibike_data/202502/202502-citibike-tripdata-part00.csv, ... 54 entries]
ReadSchema: struct<ride_id:string,rideable_type:string,started_at:timestamp,ended_at:timestamp,start_station_name:string,start_station_id:string,end_station_name:string,end_station_id:string,start_lat:double,start_lng:double,end_lat:double,end_lng:double,member_casual:string>
(4) Project
Output [21]: [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, cast(start_station_id#10535 as double) AS start_station_id#10655, end_station_name#10536, cast(end_station_id#10537 as double) AS end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, isnotnull(ride_id#10530) AS valid_ride_id#10641, (ended_at#10533 > started_at#10532) AS valid_time#10642, (((isnotnull(end_station_id#10537) AND isnotnull(start_station_id#10535)) AND NOT (end_station_id#10537 = start_station_id#10535)) <=> true) AS valid_station#10643, input_file_name() AS _source_file#10644, year(cast(started_at#10532 as date)) AS year#10648, month(cast(started_at#10532 as date)) AS month#10649, start_station_id#10535, started_at#10532]
Input [13]: [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10535, end_station_name#10536, end_station_id#10537, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542]
(5) Exchange
Input [21]: [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10655, end_station_name#10536, end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, year#10648, month#10649, start_station_id#10535, started_at#10532]
Arguments: hashpartitioning(start_station_id#10535, 200), ENSURE_REQUIREMENTS, [plan_id=2228]
(6) Sort
Input [21]: [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10655, end_station_name#10536, end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, year#10648, month#10649, start_station_id#10535, started_at#10532]
Arguments: [start_station_id#10535 ASC NULLS FIRST, started_at#10532 ASC NULLS FIRST], false, 0
(7) Window
Input [21]: [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10655, end_station_name#10536, end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, year#10648, month#10649, start_station_id#10535, started_at#10532]
Arguments: [row_number() windowspecdefinition(start_station_id#10535, started_at#10532 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _start_station_ride_num#10647], [start_station_id#10535], [started_at#10532 ASC NULLS FIRST]
(8) Project
Output [21]: [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10655, end_station_name#10536, end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, 2026-04-08 09:56:30.312891 AS _processed_dttm#10645, _start_station_ride_num#10647, year#10648, month#10649]
Input [22]: [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10655, end_station_name#10536, end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, year#10648, month#10649, start_station_id#10535, started_at#10532, _start_station_ride_num#10647]
(9) AdaptiveSparkPlan
Output [21]: [ride_id#10530, rideable_type#10531, started_at#10532, ended_at#10533, start_station_name#10534, start_station_id#10655, end_station_name#10536, end_station_id#10657, start_lat#10538, start_lng#10539, end_lat#10540, end_lng#10541, member_casual#10542, valid_ride_id#10641, valid_time#10642, valid_station#10643, _source_file#10644, _processed_dttm#10645, _start_station_ride_num#10647, year#10648, month#10649]
Arguments: isFinalPlan=false
(10) TableCacheQueryStage
Output: []
Arguments: 0
(11) HashAggregate [codegen id : 1]
Input: []
Keys: []
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#12197L]
Results [1]: [count#12198L]
(12) Exchange
Input [1]: [count#12198L]
Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=2254]
(13) ShuffleQueryStage
Output [1]: [count#12198L]
Arguments: 1
(14) HashAggregate [codegen id : 2]
Input [1]: [count#12198L]
Keys: []
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#11879L]
Results [1]: [count(1)#11879L AS count#11880L]
(15) HashAggregate
Input: []
Keys: []
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#12197L]
Results [1]: [count#12198L]
(16) Exchange
Input [1]: [count#12198L]
Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=2225]
(17) HashAggregate
Input [1]: [count#12198L]
Keys: []
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#11879L]
Results [1]: [count(1)#11879L AS count#11880L]
(18) AdaptiveSparkPlan
Output [1]: [count#11880L]
Arguments: isFinalPlan=true