March 16, 2021
Just like Checkpoint 1, but now...
'Sort ['bar ASC NULLS FIRST], true
+- 'Project [*]
+- 'UnresolvedRelation [FOO], [], false
You can rely on Scala's native sort.
/** Logical plan node for ORDER BY: the rows of `child` are sorted by the clauses in `order`. */
case class Sort(
order: Seq[SortOrder], // Order clauses
global: Boolean, // ignore this (for distributed execution)
child: LogicalPlan // Input plan
) extends UnaryNode with Product with Serializable
/** One ORDER BY clause: an expression to sort by, a direction, and where nulls are placed. */
case class SortOrder(
child: Expression, // The expression to sort by
direction: SortDirection, // Ascending or Descending
nullOrdering: NullOrdering, // NullsFirst or NullsLast
sameOrderExpressions: Seq[Expression] // ignore this (used by spark)
)
SELECT * FROM R ORDER BY A ASC, B DESC
// Logical plan for: SELECT * FROM R ORDER BY A ASC, B DESC
Sort(Seq(
  // ORDER BY A ASC, using the default null placement for ascending order
  SortOrder(UnresolvedAttribute(Seq("A")),
            Ascending,
            Ascending.defaultNullOrder,
            Seq()),
  // ..., B DESC, using the default null placement for descending order
  SortOrder(UnresolvedAttribute(Seq("B")),
            Descending,
            Descending.defaultNullOrder,
            Seq())
), true, UnresolvedRelation(Seq("R")))
'GlobalLimit 3
+- 'LocalLimit 3
+- 'Project [*]
+- 'UnresolvedRelation [FOO], [], false
Output exactly limitExpr rows if available
/** Limit node: outputs exactly `limitExpr` rows of `child`, if that many are available. */
case class GlobalLimit(
limitExpr: Expression,
child: LogicalPlan
) extends OrderPreservingUnaryNode with Product with Serializable
Output exactly limitExpr rows if available per partition
/** Limit node: outputs exactly `limitExpr` rows of `child` per partition, if that many are available. */
case class LocalLimit(
limitExpr: Expression,
child: LogicalPlan
) extends OrderPreservingUnaryNode with Product with Serializable
For this checkpoint: Pick one, ignore the other
SELECT * FROM R LIMIT 3
// Logical plan for: SELECT * FROM R LIMIT 3
// Both limit nodes wrap the relation with the same literal; for this
// checkpoint, implement one of them and ignore the other.
GlobalLimit(
Literal(3),
LocalLimit(
Literal(3),
UnresolvedRelation(Seq("R"))
)
)
$\sigma_{c_1 \wedge c_2 \wedge c_3}(R \times S) = \sigma_{c_1}(R) \bowtie_{c_2} \sigma_{c_3}(S)$
// Rewrite-rule skeleton: matches a Filter whose direct child is a Project.
plan transform {
case Filter(condition,
Project(expressions, child)) => ???
}
// Rewrite-rule skeleton: matches a Filter whose direct child is a cross-product Join.
// The Filter's predicate and the Join's own condition need distinct names:
// Scala rejects a pattern that binds the same variable twice.
plan transform {
  case Filter(condition,
         Join(lhs, rhs, Cross, joinCondition, hint)) => ???
}
/** A single plan-rewriting rule: takes a logical plan and returns a logical plan. */
trait OptimizationRule {
def apply(plan: LogicalPlan): LogicalPlan
}
/**
 * Optimization rule skeleton, implemented as a single `transform` over the
 * plan. Each `case` matches one plan shape and supplies its replacement.
 */
object PushDownSelections extends OptimizationRule {

  override def apply(plan: LogicalPlan): LogicalPlan =
    plan.transform {
      // A Filter sitting directly on top of a Project.
      case Filter(condition, Project(expressions, child)) =>
        ???
      /* and other cases here... */
    }
}
Test whether two logical plans are the same with fastEquals
Watch out for infinite loops.
// The optimization rules, in the order onePass applies them.
val rules = Seq[OptimizationRule]( ??? )
/**
 * Applies every rule in `rules` exactly once, in order, feeding the plan
 * produced by each rule into the next one.
 *
 * @param plan the plan to rewrite
 * @return the plan after all rules have been applied once
 */
def onePass(plan: LogicalPlan): LogicalPlan =
  rules.foldLeft(plan) { (current, rule) => rule.apply(current) }
/**
 * Repeatedly runs `onePass` until a pass leaves the plan unchanged
 * (as judged by `fastEquals`), then returns that plan.
 *
 * Watch out for infinite loops: this only terminates if the rules
 * eventually stop changing the plan.
 *
 * @param plan the plan to optimize
 * @return the first plan that a full pass leaves unchanged
 */
def fixpoint(plan: LogicalPlan): LogicalPlan =
{
  var current = plan
  // Plan before the most recent pass; None until the first pass has run.
  var last: Option[LogicalPlan] = None
  while(!last.exists(prev => current.fastEquals(prev))){
    last = Some(current)
    current = onePass(current)
  }
  current
}
The reference implementation uses Scala's Map.
Keep in mind that you may need to hash multiple tuples to the same join key.
Cross product is expensive!
Can we do better?
$\sigma_c(R\times S) \equiv R\bowtie_c S$
Problem: Naively, any tuple matches any other
Solution: First organize the data
$$h(X) \mod N$$
Repeat Pass 2 As Needed.