digraph G {
0 [labelType="html" label="<br><b>AdaptiveSparkPlan</b><br><br>"];
subgraph cluster1 {
isCluster="true";
label="WholeStageCodegen (2)\n \nduration: 0 ms";
2 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build: 0 ms<br>number of output rows: 1"];
}
3 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 2<br>local merged chunks fetched: 0<br>shuffle write time total (min, med, max (stageId: taskId))<br>1 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))<br>remote merged bytes read: 0.0 B<br>local merged blocks fetched: 0<br>corrupt merged block chunks: 0<br>remote merged reqs duration: 0 ms<br>remote merged blocks fetched: 0<br>records read: 2<br>local bytes read: 118.0 B<br>fetch wait time: 0 ms<br>remote bytes read: 0.0 B<br>merged fetch fallback count: 0<br>local blocks read: 2<br>remote merged chunks fetched: 0<br>remote blocks read: 0<br>data size total (min, med, max (stageId: taskId))<br>32.0 B (0.0 B, 16.0 B, 16.0 B (stage 77.0: task 61))<br>local merged bytes read: 0.0 B<br>number of partitions: 1<br>remote reqs duration: 0 ms<br>remote bytes read to disk: 0.0 B<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>118.0 B (0.0 B, 59.0 B, 59.0 B (stage 77.0: task 61))"];
subgraph cluster4 {
isCluster="true";
label="WholeStageCodegen (1)\n \nduration: total (min, med, max (stageId: taskId))\n18 ms (9 ms, 9 ms, 9 ms (stage 77.0: task 61))";
5 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build total (min, med, max (stageId: taskId))<br>17 ms (8 ms, 9 ms, 9 ms (stage 77.0: task 62))<br>number of output rows: 2"];
6 [labelType="html" label="<br><b>Project</b><br><br>"];
7 [labelType="html" label="<b>Filter</b><br><br>number of output rows: 483"];
}
8 [labelType="html" label="<b>InMemoryTableScan</b><br><br>number of output rows: 31,257"];
9 [labelType="html" label="<br><b>AdaptiveSparkPlan</b><br><br>"];
subgraph cluster10 {
isCluster="true";
label="WholeStageCodegen (3)\n \nduration: total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))";
11 [labelType="html" label="<br><b>Project</b><br><br>"];
}
12 [labelType="html" label="<b>Window</b><br><br>spill size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))"];
subgraph cluster13 {
isCluster="true";
label="WholeStageCodegen (2)\n \nduration: total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))";
14 [labelType="html" label="<b>Sort</b><br><br>sort time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))<br>peak memory total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))<br>spill size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))"];
}
15 [labelType="html" label="<br><b>AQEShuffleRead</b><br><br>"];
16 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 0<br>local merged chunks fetched: 0<br>shuffle write time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))<br>remote merged bytes read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))<br>local merged blocks fetched: 0<br>corrupt merged block chunks: 0<br>remote merged reqs duration total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))<br>remote merged blocks fetched: 0<br>records read: 0<br>local bytes read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))<br>fetch wait time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))<br>remote bytes read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))<br>merged fetch fallback count: 0<br>local blocks read: 0<br>remote merged chunks fetched: 0<br>remote blocks read: 0<br>data size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))<br>local merged bytes read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))<br>number of partitions: 0<br>remote reqs duration total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))<br>remote bytes read to disk total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))"];
subgraph cluster17 {
isCluster="true";
label="WholeStageCodegen (1)\n \nduration: total (min, med, max (stageId: taskId))\n0 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))";
18 [labelType="html" label="<br><b>Project</b><br><br>"];
}
19 [labelType="html" label="<b>Scan csv </b><br><br>number of output rows: 0<br>number of files read: 0<br>metadata time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 77.0: task 61))<br>size of files read total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 77.0: task 61))"];
2->0;
3->2;
5->3;
6->5;
7->6;
8->7;
9->8;
11->9;
12->11;
14->12;
15->14;
16->15;
18->16;
19->18;
}
20
AdaptiveSparkPlan isFinalPlan=true
HashAggregate(keys=[], functions=[count(1)])
WholeStageCodegen (2)
Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=931]
HashAggregate(keys=[], functions=[partial_count(1)])
Project
Filter ((NOT valid_ride_id#5038 OR NOT valid_time#5039) OR NOT coalesce(valid_station#5040, false))
WholeStageCodegen (1)
InMemoryTableScan [valid_ride_id#5038, valid_station#5040, valid_time#5039], [((NOT valid_ride_id#5038 OR NOT valid_time#5039) OR NOT coalesce(valid_station#5040, false))]
AdaptiveSparkPlan isFinalPlan=true
Project [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#359, end_station_name#258, end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, valid_ride_id#345, valid_time#346, valid_station#347, _source_file#348, 2026-04-08 08:20:23.917168 AS _processed_dttm#349, _start_station_ride_num#351, year#352, month#353]
WholeStageCodegen (3)
Window [row_number() windowspecdefinition(start_station_id#257, started_at#254 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _start_station_ride_num#351], [start_station_id#257], [started_at#254 ASC NULLS FIRST]
Sort [start_station_id#257 ASC NULLS FIRST, started_at#254 ASC NULLS FIRST], false, 0
WholeStageCodegen (2)
AQEShuffleRead coalesced
Exchange hashpartitioning(start_station_id#257, 200), ENSURE_REQUIREMENTS, [plan_id=151]
Project [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, cast(start_station_id#257 as double) AS start_station_id#359, end_station_name#258, cast(end_station_id#259 as double) AS end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, isnotnull(ride_id#252) AS valid_ride_id#345, (ended_at#255 > started_at#254) AS valid_time#346, NOT (end_station_id#259 = start_station_id#257) AS valid_station#347, input_file_name() AS _source_file#348, year(cast(started_at#254 as date)) AS year#352, month(cast(started_at#254 as date)) AS month#353, start_station_id#257, started_at#254]
WholeStageCodegen (1)
FileScan csv [ride_id#252,rideable_type#253,started_at#254,ended_at#255,start_station_name#256,start_station_id#257,end_station_name#258,end_station_id#259,start_lat#260,start_lng#261,end_lat#262,end_lng#263,member_casual#264] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[s3a://rzvde-g8-kirsanov-dmitry/raw/citibike_data/202502/202502-citibik..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ride_id:string,rideable_type:string,started_at:timestamp,ended_at:timestamp,start_station_...
== Physical Plan ==
AdaptiveSparkPlan (22)
+- == Final Plan ==
* HashAggregate (16)
+- ShuffleQueryStage (15), Statistics(sizeInBytes=32.0 B, rowCount=2)
+- Exchange (14)
+- * HashAggregate (13)
+- * Project (12)
+- * Filter (11)
+- TableCacheQueryStage (10), Statistics(sizeInBytes=8.2 MiB, rowCount=3.13E+4)
+- InMemoryTableScan (1)
+- InMemoryRelation (2)
+- AdaptiveSparkPlan (9)
+- Project (8)
+- Window (7)
+- Sort (6)
+- Exchange (5)
+- Project (4)
+- Scan csv (3)
+- == Initial Plan ==
HashAggregate (21)
+- Exchange (20)
+- HashAggregate (19)
+- Project (18)
+- Filter (17)
+- InMemoryTableScan (1)
+- InMemoryRelation (2)
+- AdaptiveSparkPlan (9)
+- Project (8)
+- Window (7)
+- Sort (6)
+- Exchange (5)
+- Project (4)
+- Scan csv (3)
(1) InMemoryTableScan
Output [3]: [valid_ride_id#5038, valid_station#5040, valid_time#5039]
Arguments: [valid_ride_id#5038, valid_station#5040, valid_time#5039], [((NOT valid_ride_id#5038 OR NOT valid_time#5039) OR NOT coalesce(valid_station#5040, false))]
(2) InMemoryRelation
Arguments: [ride_id#5047, rideable_type#5048, started_at#5049, ended_at#5050, start_station_name#5051, start_station_id#5052, end_station_name#5053, end_station_id#5054, start_lat#5055, start_lng#5056, end_lat#5057, end_lng#5058, member_casual#5059, valid_ride_id#5038, valid_time#5039, valid_station#5040, _source_file#5041, _processed_dttm#5042, _start_station_ride_num#5044, year#5045, month#5046], CachedRDDBuilder(org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer@43d79ed6,StorageLevel(disk, memory, deserialized, 1 replicas),AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
*(3) Project [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#359, end_station_name#258, end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, valid_ride_id#345, valid_time#346, valid_station#347, _source_file#348, 2026-04-08 08:20:23.917168 AS _processed_dttm#349, _start_station_ride_num#351, year#352, month#353]
+- Window [row_number() windowspecdefinition(start_station_id#257, started_at#254 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _start_station_ride_num#351], [start_station_id#257], [started_at#254 ASC NULLS FIRST]
+- *(2) Sort [start_station_id#257 ASC NULLS FIRST, started_at#254 ASC NULLS FIRST], false, 0
+- AQEShuffleRead coalesced
+- ShuffleQueryStage 0
+- Exchange hashpartitioning(start_station_id#257, 200), ENSURE_REQUIREMENTS, [plan_id=151]
+- *(1) Project [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, cast(start_station_id#257 as double) AS start_station_id#359, end_station_name#258, cast(end_station_id#259 as double) AS end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, isnotnull(ride_id#252) AS valid_ride_id#345, (ended_at#255 > started_at#254) AS valid_time#346, NOT (end_station_id#259 = start_station_id#257) AS valid_station#347, input_file_name() AS _source_file#348, year(cast(started_at#254 as date)) AS year#352, month(cast(started_at#254 as date)) AS month#353, start_station_id#257, started_at#254]
+- FileScan csv [ride_id#252,rideable_type#253,started_at#254,ended_at#255,start_station_name#256,start_station_id#257,end_station_name#258,end_station_id#259,start_lat#260,start_lng#261,end_lat#262,end_lng#263,member_casual#264] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[s3a://rzvde-g8-kirsanov-dmitry/raw/citibike_data/202502/202502-citibik..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ride_id:string,rideable_type:string,started_at:timestamp,ended_at:timestamp,start_station_...
+- == Initial Plan ==
Project [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#359, end_station_name#258, end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, valid_ride_id#345, valid_time#346, valid_station#347, _source_file#348, 2026-04-08 08:20:23.917168 AS _processed_dttm#349, _start_station_ride_num#351, year#352, month#353]
+- Window [row_number() windowspecdefinition(start_station_id#257, started_at#254 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _start_station_ride_num#351], [start_station_id#257], [started_at#254 ASC NULLS FIRST]
+- Sort [start_station_id#257 ASC NULLS FIRST, started_at#254 ASC NULLS FIRST], false, 0
+- Exchange hashpartitioning(start_station_id#257, 200), ENSURE_REQUIREMENTS, [plan_id=113]
+- Project [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, cast(start_station_id#257 as double) AS start_station_id#359, end_station_name#258, cast(end_station_id#259 as double) AS end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, isnotnull(ride_id#252) AS valid_ride_id#345, (ended_at#255 > started_at#254) AS valid_time#346, NOT (end_station_id#259 = start_station_id#257) AS valid_station#347, input_file_name() AS _source_file#348, year(cast(started_at#254 as date)) AS year#352, month(cast(started_at#254 as date)) AS month#353, start_station_id#257, started_at#254]
+- FileScan csv [ride_id#252,rideable_type#253,started_at#254,ended_at#255,start_station_name#256,start_station_id#257,end_station_name#258,end_station_id#259,start_lat#260,start_lng#261,end_lat#262,end_lng#263,member_casual#264] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[s3a://rzvde-g8-kirsanov-dmitry/raw/citibike_data/202502/202502-citibik..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ride_id:string,rideable_type:string,started_at:timestamp,ended_at:timestamp,start_station_...
,None)
(3) Scan csv
Output [13]: [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#257, end_station_name#258, end_station_id#259, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264]
Batched: false
Location: InMemoryFileIndex [s3a://rzvde-g8-kirsanov-dmitry/raw/citibike_data/202502/202502-citibike-tripdata-part00.csv]
ReadSchema: struct<ride_id:string,rideable_type:string,started_at:timestamp,ended_at:timestamp,start_station_name:string,start_station_id:string,end_station_name:string,end_station_id:string,start_lat:double,start_lng:double,end_lat:double,end_lng:double,member_casual:string>
(4) Project
Output [21]: [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, cast(start_station_id#257 as double) AS start_station_id#359, end_station_name#258, cast(end_station_id#259 as double) AS end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, isnotnull(ride_id#252) AS valid_ride_id#345, (ended_at#255 > started_at#254) AS valid_time#346, NOT (end_station_id#259 = start_station_id#257) AS valid_station#347, input_file_name() AS _source_file#348, year(cast(started_at#254 as date)) AS year#352, month(cast(started_at#254 as date)) AS month#353, start_station_id#257, started_at#254]
Input [13]: [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#257, end_station_name#258, end_station_id#259, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264]
(5) Exchange
Input [21]: [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#359, end_station_name#258, end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, valid_ride_id#345, valid_time#346, valid_station#347, _source_file#348, year#352, month#353, start_station_id#257, started_at#254]
Arguments: hashpartitioning(start_station_id#257, 200), ENSURE_REQUIREMENTS, [plan_id=895]
(6) Sort
Input [21]: [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#359, end_station_name#258, end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, valid_ride_id#345, valid_time#346, valid_station#347, _source_file#348, year#352, month#353, start_station_id#257, started_at#254]
Arguments: [start_station_id#257 ASC NULLS FIRST, started_at#254 ASC NULLS FIRST], false, 0
(7) Window
Input [21]: [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#359, end_station_name#258, end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, valid_ride_id#345, valid_time#346, valid_station#347, _source_file#348, year#352, month#353, start_station_id#257, started_at#254]
Arguments: [row_number() windowspecdefinition(start_station_id#257, started_at#254 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _start_station_ride_num#351], [start_station_id#257], [started_at#254 ASC NULLS FIRST]
(8) Project
Output [21]: [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#359, end_station_name#258, end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, valid_ride_id#345, valid_time#346, valid_station#347, _source_file#348, 2026-04-08 08:20:23.917168 AS _processed_dttm#349, _start_station_ride_num#351, year#352, month#353]
Input [22]: [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#359, end_station_name#258, end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, valid_ride_id#345, valid_time#346, valid_station#347, _source_file#348, year#352, month#353, start_station_id#257, started_at#254, _start_station_ride_num#351]
(9) AdaptiveSparkPlan
Output [21]: [ride_id#252, rideable_type#253, started_at#254, ended_at#255, start_station_name#256, start_station_id#359, end_station_name#258, end_station_id#361, start_lat#260, start_lng#261, end_lat#262, end_lng#263, member_casual#264, valid_ride_id#345, valid_time#346, valid_station#347, _source_file#348, _processed_dttm#349, _start_station_ride_num#351, year#352, month#353]
Arguments: isFinalPlan=false
(10) TableCacheQueryStage
Output [3]: [valid_ride_id#5038, valid_station#5040, valid_time#5039]
Arguments: 0
(11) Filter [codegen id : 1]
Input [3]: [valid_ride_id#5038, valid_station#5040, valid_time#5039]
Condition : ((NOT valid_ride_id#5038 OR NOT valid_time#5039) OR NOT coalesce(valid_station#5040, false))
(12) Project [codegen id : 1]
Output: []
Input [3]: [valid_ride_id#5038, valid_station#5040, valid_time#5039]
(13) HashAggregate [codegen id : 1]
Input: []
Keys: []
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#8746L]
Results [1]: [count#8747L]
(14) Exchange
Input [1]: [count#8747L]
Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=931]
(15) ShuffleQueryStage
Output [1]: [count#8747L]
Arguments: 1
(16) HashAggregate [codegen id : 2]
Input [1]: [count#8747L]
Keys: []
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#8428L]
Results [1]: [count(1)#8428L AS count#8429L]
(17) Filter
Input [3]: [valid_ride_id#5038, valid_station#5040, valid_time#5039]
Condition : ((NOT valid_ride_id#5038 OR NOT valid_time#5039) OR NOT coalesce(valid_station#5040, false))
(18) Project
Output: []
Input [3]: [valid_ride_id#5038, valid_station#5040, valid_time#5039]
(19) HashAggregate
Input: []
Keys: []
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#8746L]
Results [1]: [count#8747L]
(20) Exchange
Input [1]: [count#8747L]
Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=892]
(21) HashAggregate
Input [1]: [count#8747L]
Keys: []
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#8428L]
Results [1]: [count(1)#8428L AS count#8429L]
(22) AdaptiveSparkPlan
Output [1]: [count#8429L]
Arguments: isFinalPlan=true