diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_10.html b/docs/StardustDocs/resources/api/join/notebook_test_join_10.html new file mode 100644 index 0000000000..e4345f7261 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_10.html @@ -0,0 +1,512 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_11.html b/docs/StardustDocs/resources/api/join/notebook_test_join_11.html new file mode 100644 index 0000000000..cade392f1b --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_11.html @@ -0,0 +1,514 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_12.html b/docs/StardustDocs/resources/api/join/notebook_test_join_12.html new file mode 100644 index 0000000000..bea117d8c7 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_12.html @@ -0,0 +1,513 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_13.html b/docs/StardustDocs/resources/api/join/notebook_test_join_13.html new file mode 100644 index 0000000000..1f8f389cd5 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_13.html @@ -0,0 +1,512 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_14.html b/docs/StardustDocs/resources/api/join/notebook_test_join_14.html new file mode 100644 index 0000000000..e4345f7261 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_14.html @@ -0,0 +1,512 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_15.html b/docs/StardustDocs/resources/api/join/notebook_test_join_15.html new file mode 100644 index 0000000000..bea117d8c7 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_15.html @@ -0,0 +1,513 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_16.html b/docs/StardustDocs/resources/api/join/notebook_test_join_16.html new file mode 100644 index 0000000000..76f60f5259 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_16.html @@ -0,0 +1,512 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_17.html b/docs/StardustDocs/resources/api/join/notebook_test_join_17.html new file mode 100644 index 0000000000..eec3909749 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_17.html @@ -0,0 +1,513 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_18.html b/docs/StardustDocs/resources/api/join/notebook_test_join_18.html new file mode 100644 index 0000000000..86f0c75100 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_18.html @@ -0,0 +1,513 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_19.html b/docs/StardustDocs/resources/api/join/notebook_test_join_19.html new file mode 100644 index 0000000000..3bae409032 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_19.html @@ -0,0 +1,513 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_20.html b/docs/StardustDocs/resources/api/join/notebook_test_join_20.html new file mode 100644 index 0000000000..0a4899ec25 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_20.html @@ -0,0 +1,512 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_3.html b/docs/StardustDocs/resources/api/join/notebook_test_join_3.html new file mode 100644 index 0000000000..a13f41d91c --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_3.html @@ -0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_5.html b/docs/StardustDocs/resources/api/join/notebook_test_join_5.html new file mode 100644 index 0000000000..c5b1ba3bf7 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_5.html @@ -0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_6.html b/docs/StardustDocs/resources/api/join/notebook_test_join_6.html new file mode 100644 index 0000000000..515f5a7101 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_6.html @@ -0,0 +1,512 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/join/notebook_test_join_8.html b/docs/StardustDocs/resources/api/join/notebook_test_join_8.html new file mode 100644 index 0000000000..1f8f389cd5 --- /dev/null +++ b/docs/StardustDocs/resources/api/join/notebook_test_join_8.html @@ -0,0 +1,512 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/topics/_shadow_resources.md b/docs/StardustDocs/topics/_shadow_resources.md index f4c5465027..9868d51963 100644 --- a/docs/StardustDocs/topics/_shadow_resources.md +++ b/docs/StardustDocs/topics/_shadow_resources.md @@ -166,6 +166,21 @@ + + + + + + + + + + + + + + + diff --git a/docs/StardustDocs/topics/join.md b/docs/StardustDocs/topics/join.md deleted file mode 100644 index 1358d67b86..0000000000 --- a/docs/StardustDocs/topics/join.md +++ /dev/null @@ -1,112 +0,0 @@ -[//]: # (title: join) - - - -Joins two [`DataFrame`](DataFrame.md) object by join columns. - -```kotlin -join(otherDf, type = JoinType.Inner) [ { joinColumns } ] - -joinColumns: JoinDsl.(LeftDataFrame) -> Columns - -interface JoinDsl: LeftDataFrame { - - val right: RightDataFrame - - fun DataColumn.match(rightColumn: DataColumn) -} -``` - -`joinColumns` is a [column selector](ColumnSelectors.md) that defines column mapping for join: - -Related operations: [](multipleDataFrames.md) - - - - - -```kotlin -df.join(other) { name match right.fullName } -``` - - - - -```kotlin -df.join(other) { "name" match "fullName" } -``` - - - - - -If mapped columns have the same name, just select join columns from the left [`DataFrame`](DataFrame.md): - - - - - -```kotlin -df.join(other) { name and city } -``` - - - - -```kotlin -df.join(other, "name", "city") -``` - - - - - -If `joinColumns` is not specified, columns with the same name from both [`DataFrame`](DataFrame.md) objects will be used as join columns: - - - -```kotlin -df.join(other) -``` - - - - -### Join types - -Supported join types: -* `Inner` (default) — only matched rows from left and right [`DataFrame`](DataFrame.md) objects -* `Filter` — only matched rows from left [`DataFrame`](DataFrame.md) -* `Left` — all rows from left [`DataFrame`](DataFrame.md), mismatches from right [`DataFrame`](DataFrame.md) filled with `null` -* `Right` — all rows from right [`DataFrame`](DataFrame.md), mismatches from left 
[`DataFrame`](DataFrame.md) filled with `null` -* `Full` — all rows from left and right [`DataFrame`](DataFrame.md) objects, any mismatches filled with `null` -* `Exclude` — only mismatched rows from left [`DataFrame`](DataFrame.md) - -For every join type there is a shortcut operation: - - - - - -```kotlin -df.innerJoin(other) { name and city } -df.leftJoin(other) { name and city } -df.rightJoin(other) { name and city } -df.fullJoin(other) { name and city } -df.excludeJoin(other) { name and city } -``` - - - - -```kotlin -df.innerJoin(other, "name", "city") -df.leftJoin(other, "name", "city") -df.rightJoin(other, "name", "city") -df.fullJoin(other, "name", "city") -df.excludeJoin(other, "name", "city") -``` - - - - diff --git a/docs/StardustDocs/topics/operations/multiple/join.md b/docs/StardustDocs/topics/operations/multiple/join.md new file mode 100644 index 0000000000..3649480e58 --- /dev/null +++ b/docs/StardustDocs/topics/operations/multiple/join.md @@ -0,0 +1,362 @@ +[//]: # (title: join) + + + +Joins two [`DataFrame`](DataFrame.md) objects by join columns. + +A *join* creates a new dataframe by combining rows from two input dataframes according to one or more key columns. +Rows are merged when the values in the join columns match. +If there is no match, whether the row is included and how missing values are filled depends on the type of join (e.g., inner, left, right, full). + +Returns a new [`DataFrame`](DataFrame.md) that contains the merged rows and columns from both inputs. 
+ +```kotlin +join(otherDf, type = JoinType.Inner) [ { joinColumns } ] + +joinColumns: JoinDsl.(LeftDataFrame) -> Columns + +interface JoinDsl: LeftDataFrame { + + val right: RightDataFrame + + fun DataColumn.match(rightColumn: DataColumn) +} +``` + +`joinColumns` is a [column selector](ColumnSelectors.md) that defines column mapping for join: + +Related operations: [](multipleDataFrames.md) + +## Examples + + + +```kotlin +dfAges +``` + + + + + + + +```kotlin +dfCities +``` + + + + + + + +```kotlin +// INNER JOIN on differently named keys: +// Merge a row when dfAges.firstName == dfCities.name. +// With the given data all 3 names match → all rows merge. +dfAges.join(dfCities) { firstName match right.name } +``` + + + + + +If mapped columns have the same name, just select join columns from the left [`DataFrame`](DataFrame.md): + + + +```kotlin +dfLeft +``` + + + + + + + + +```kotlin +dfRight +``` + + + + + + + +```kotlin +// INNER JOIN on "name" only: +// Merge when left.name == right.name. +// Duplicate keys produce multiple merged rows (one per pairing). +dfLeft.join(dfRight) { name } +``` + + + + + +> In this example, the "city" columns from the left and right dataframes are not matched against each other. +> After joining, the "city" column from the right dataframe is included in the resulting dataframe +> with the name **"city1"** to avoid a name conflict. +> { style = "note" } + + +If `joinColumns` is not specified, columns with the same name from both [`DataFrame`](DataFrame.md) +objects will be used as join columns: + + + + +```kotlin +// INNER JOIN on all same-named columns ("name" and "city"): +// Merge when BOTH name AND city are equal; otherwise the row is dropped. 
+dfLeft.join(dfRight) +``` + + + + + + +## Join types + +Supported join types: +* `Inner` (default) — only matched rows from left and right [`DataFrame`](DataFrame.md) objects +* `Filter` — only matched rows from left [`DataFrame`](DataFrame.md) +* `Left` — all rows from left [`DataFrame`](DataFrame.md), mismatches from right [`DataFrame`](DataFrame.md) filled with `null` +* `Right` — all rows from right [`DataFrame`](DataFrame.md), mismatches from left [`DataFrame`](DataFrame.md) filled with `null` +* `Full` — all rows from left and right [`DataFrame`](DataFrame.md) objects, any mismatches filled with `null` +* `Exclude` — only mismatched rows from left [`DataFrame`](DataFrame.md) + +For every join type there is a shortcut operation: + +```kotlin +df.innerJoin(otherDf) [ { joinColumns } ] +df.filterJoin(otherDf) [ { joinColumns } ] +df.leftJoin(otherDf) [ { joinColumns } ] +df.rightJoin(otherDf) [ { joinColumns } ] +df.fullJoin(otherDf) [ { joinColumns } ] +df.excludeJoin(otherDf) [ { joinColumns } ] +``` + + +### Examples {id="examples_1"} + +#### Inner {collapsible="true"} + + + +```kotlin +dfLeft +``` + + + + + + + +```kotlin +dfRight +``` + + + + + + + +```kotlin +// INNER JOIN: +// Keep only rows where (name, city) match on both sides. +// In this dataset only Alice–London and Charlie–Moscow match → 2 merged rows. +dfLeft.innerJoin(dfRight) { name and city } +``` + + + + + +#### Filter {collapsible="true"} + + + +```kotlin +dfLeft +``` + + + + + + + +```kotlin +dfRight +``` + + + + + + + + +```kotlin +// FILTER JOIN: +// Keep ONLY left rows that have ANY match on (name, city). +// No right-side columns are added. +dfLeft.filterJoin(dfRight) { name and city } +``` + + + + + +#### Left {collapsible="true"} + + + +```kotlin +dfLeft +``` + + + + + + + +```kotlin +dfRight +``` + + + + + + + + +```kotlin +// LEFT JOIN: +// Keep ALL left rows. If (name, city) matches, attach right columns; +// if not, right columns are null (e.g., Bob–Dubai has no right match). 
+dfLeft.leftJoin(dfRight) { name and city } +``` + + + + + +#### Right {collapsible="true"} + + + +```kotlin +dfLeft +``` + + + + + + + +```kotlin +dfRight +``` + + + + + + + + +```kotlin +// RIGHT JOIN: +// Keep ALL right rows. If no left match, left columns become null +// (e.g., Alice with city=null exists only on the right). +dfLeft.rightJoin(dfRight) { name and city } +``` + + + + + +#### Full {collapsible="true"} + + + +```kotlin +dfLeft +``` + + + + + + + +```kotlin +dfRight +``` + + + + + + + + +```kotlin +// FULL JOIN: +// Keep ALL rows from both sides. Where there's no match on (name, city), +// the other side is filled with nulls. +dfLeft.fullJoin(dfRight) { name and city } +``` + + + + + +#### Exclude {collapsible="true"} + + + +```kotlin +dfLeft +``` + + + + + + + +```kotlin +dfRight +``` + + + + + + + + +```kotlin +// EXCLUDE JOIN: +// Keep ONLY left rows that have NO match on (name, city). +// Useful to find "unpaired" left rows. +dfLeft.excludeJoin(dfRight) { name and city } +``` + + + + + diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/multiple/JoinSamples.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/multiple/JoinSamples.kt new file mode 100644 index 0000000000..b571c07b3d --- /dev/null +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/multiple/JoinSamples.kt @@ -0,0 +1,272 @@ +package org.jetbrains.kotlinx.dataframe.samples.api.multiple + +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.annotations.DataSchema +import org.jetbrains.kotlinx.dataframe.api.RgbColor +import org.jetbrains.kotlinx.dataframe.api.and +import org.jetbrains.kotlinx.dataframe.api.cast +import org.jetbrains.kotlinx.dataframe.api.dataFrameOf +import org.jetbrains.kotlinx.dataframe.api.excludeJoin +import org.jetbrains.kotlinx.dataframe.api.filterJoin +import org.jetbrains.kotlinx.dataframe.api.format +import org.jetbrains.kotlinx.dataframe.api.fullJoin 
+import org.jetbrains.kotlinx.dataframe.api.innerJoin +import org.jetbrains.kotlinx.dataframe.api.join +import org.jetbrains.kotlinx.dataframe.api.leftJoin +import org.jetbrains.kotlinx.dataframe.api.perRowCol +import org.jetbrains.kotlinx.dataframe.api.rightJoin +import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper +import org.jetbrains.kotlinx.kandy.letsplot.style.LayoutParameters.Companion.background +import org.junit.Test + +class JoinSamples : DataFrameSampleHelper("join", "api") { + + @DataSchema + interface DfAges { + val age: Int + val firstName: String + } + + private val dfAges = dataFrameOf( + "firstName" to listOf("Alice", "Bob", "Charlie"), + "age" to listOf(14, 45, 20), + ).cast() + + @DataSchema + interface DfCities { + val city: String + val name: String + } + + private val dfCities = dataFrameOf( + "name" to listOf("Bob", "Alice", "Charlie"), + "city" to listOf("London", "Dubai", "Moscow"), + ).cast() + + @DataSchema + interface DfWithNameAndCity { + val name: String + val city: String? + } + + @DataSchema + interface DfLeft : DfWithNameAndCity { + val age: Int + override val city: String + override val name: String + } + + private val dfLeft = dataFrameOf( + "name" to listOf("Alice", "Bob", "Charlie", "Charlie"), + "age" to listOf(15, 45, 20, 40), + "city" to listOf("London", "Dubai", "Moscow", "Tokyo"), + ).cast() + + @DataSchema + interface DfRight : DfWithNameAndCity { + override val city: String? 
+ val isBusy: Boolean + override val name: String + } + + private val dfRight = dataFrameOf( + "name" to listOf("Alice", "Bob", "Alice", "Charlie"), + "isBusy" to listOf(true, false, true, true), + "city" to listOf("London", "Tokyo", null, "Moscow"), + ).cast() + + private fun nameToColor(name: String): RgbColor = + when (name) { + "Alice" -> RgbColor(189, 206, 233) + "Bob" -> RgbColor(198, 224, 198) + "Charlie" -> RgbColor(219, 198, 230) + else -> RgbColor(255, 255, 255) + } + + private fun nameAndCityToColor(name: String, city: String?): RgbColor = + when (name to city) { + "Alice" to "London" -> RgbColor(242, 210, 189) + "Bob" to "Dubai" -> RgbColor(245, 226, 191) + "Charlie" to "Moscow" -> RgbColor(210, 229, 199) + "Charlie" to "Tokyo" -> RgbColor(191, 223, 232) + "Bob" to "Tokyo" -> RgbColor(200, 200, 232) + "Alice" to null -> RgbColor(233, 199, 220) + else -> RgbColor(255, 255, 255) + } + + fun DataFrame.colorized() = + format().perRowCol { row, _ -> + val color = nameAndCityToColor(row.name, row.city) + background(color) and textColor(black) + } + + @Test + fun notebook_test_join_3() { + // SampleStart + dfAges + // SampleEnd + .format().perRowCol { row, _ -> + val color = nameToColor(row.firstName) + background(color) and textColor(black) + } + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_5() { + // SampleStart + dfCities + // SampleEnd + .format().perRowCol { row, _ -> + val color = nameToColor(row.name) + background(color) and textColor(black) + } + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_6() { + // SampleStart + // INNER JOIN on differently named keys: + // Merge a row when dfAges.firstName == dfCities.name. + // With the given data all 3 names match → all rows merge. 
+ dfAges.join(dfCities) { firstName match right.name } + // SampleEnd + .format().perRowCol { row, _ -> + val color = nameToColor(row.firstName) + background(color) and textColor(black) + } + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_8() { + // SampleStart + dfLeft + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_10() { + // SampleStart + dfRight + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_11() { + // SampleStart + // INNER JOIN on "name" only: + // Merge when left.name == right.name. + // Duplicate keys produce multiple merged rows (one per pairing). + dfLeft.join(dfRight) { name } + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_12() { + // SampleStart + // INNER JOIN on all same-named columns ("name" and "city"): + // Merge when BOTH name AND city are equal; otherwise the row is dropped. + dfLeft.join(dfRight) + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_13() { + // SampleStart + dfLeft + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_14() { + // SampleStart + dfRight + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_15() { + // SampleStart + // INNER JOIN: + // Keep only rows where (name, city) match on both sides. + // In this dataset only Alice–London and Charlie–Moscow match → 2 merged rows. + dfLeft.innerJoin(dfRight) { name and city } + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_16() { + // SampleStart + // FILTER JOIN: + // Keep ONLY left rows that have ANY match on (name, city). + // No right-side columns are added. + dfLeft.filterJoin(dfRight) { name and city } + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_17() { + // SampleStart + // LEFT JOIN: + // Keep ALL left rows. 
If (name, city) matches, attach right columns; + // if not, right columns are null (e.g., Bob–Dubai has no right match). + dfLeft.leftJoin(dfRight) { name and city } + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_18() { + // SampleStart + // RIGHT JOIN: + // Keep ALL right rows. If no left match, left columns become null + // (e.g., Alice with city=null exists only on the right). + dfLeft.rightJoin(dfRight) { name and city } + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_19() { + // SampleStart + // FULL JOIN: + // Keep ALL rows from both sides. Where there's no match on (name, city), + // the other side is filled with nulls. + dfLeft.fullJoin(dfRight) { name and city } + // SampleEnd + .colorized() + .saveDfHtmlSample() + } + + @Test + fun notebook_test_join_20() { + // SampleStart + // EXCLUDE JOIN: + // Keep ONLY left rows that have NO match on (name, city). + // Useful to find "unpaired" left rows. + dfLeft.excludeJoin(dfRight) { name and city } + // SampleEnd + .colorized() + .saveDfHtmlSample() + } +}