Combine columns from two datasets¶
The cbind function lets you combine datasets by adding columns from one dataset into another. If the datasets contain common column names, H2O will append 0 to the name of the duplicated column from the second dataset (for example, a second "fruit" column becomes "fruit0").
Note
The two datasets you are combining columns from must have the same number of rows.
import h2o
h2o.init()
import numpy as np
# Generate a random dataset with 10 rows and 4 columns.
# Label the columns A, B, C, and D.
cols1_df = h2o.H2OFrame.from_python(np.random.randn(10,4).tolist(), column_names=list('ABCD'))
cols1_df.describe
A B C D
--------- --------- --------- ----------
0.660737 -1.11679 0.278233 -0.0326621
-0.124613 -0.668794 0.558957 1.11402
0.944408 -1.6397 0.616223 0.137581
0.739501 0.671192 0.715497 -0.361146
1.52177 0.232701 0.196153 0.499426
-1.48407 0.222175 2.45155 -0.470239
0.880962 0.906569 -0.767418 1.38261
0.509212 0.602155 1.41956 1.96045
1.11071 0.779309 1.77455 -0.400746
-0.881062 -0.897391 0.980548 -0.266982
[10 rows x 4 columns]
# Generate a second random dataset with 10 rows and 2 columns.
# Label the columns Y and Z.
cols2_df = h2o.H2OFrame.from_python(np.random.randn(10,2).tolist(), column_names=list('YZ'))
cols2_df.describe
Y Z
---------- ----------
0.54945 0.0283338
1.27367 -1.46298
0.875547 0.317876
2.12603 0.371443
0.662796 1.0291
-0.267864 0.86477
-1.51065 0.71466
0.0676983 -0.844925
0.311779 0.0397941
0.363517 0.465146
[10 rows x 2 columns]
# Add the columns from the second dataset into the first.
# H2O will append these as the right-most columns.
colsCombine_df = cols1_df.cbind(cols2_df)
colsCombine_df.describe
A B C D Y Z
--------- --------- --------- ---------- ---------- ----------
0.660737 -1.11679 0.278233 -0.0326621 0.54945 0.0283338
-0.124613 -0.668794 0.558957 1.11402 1.27367 -1.46298
0.944408 -1.6397 0.616223 0.137581 0.875547 0.317876
0.739501 0.671192 0.715497 -0.361146 2.12603 0.371443
1.52177 0.232701 0.196153 0.499426 0.662796 1.0291
-1.48407 0.222175 2.45155 -0.470239 -0.267864 0.86477
0.880962 0.906569 -0.767418 1.38261 -1.51065 0.71466
0.509212 0.602155 1.41956 1.96045 0.0676983 -0.844925
1.11071 0.779309 1.77455 -0.400746 0.311779 0.0397941
-0.881062 -0.897391 0.980548 -0.266982 0.363517 0.465146
[10 rows x 6 columns]
library(h2o)
h2o.init()
# Create two simple, two-column R data frames by inputting values,
# ensuring that both have a common column (in this case, "fruit").
left <- data.frame(fruit = c('apple', 'orange', 'banana', 'lemon', 'strawberry', 'blueberry'),
color = c('red', 'orange', 'yellow', 'yellow', 'red', 'blue'))
right <- data.frame(fruit = c('apple', 'orange', 'banana', 'lemon', 'strawberry', 'watermelon'),
citrus = c(FALSE, TRUE, FALSE, TRUE, FALSE, FALSE))
# Create the H2O data frames from the inputted data.
left_frame <- as.h2o(left)
print(left_frame)
fruit color
1 apple red
2 orange orange
3 banana yellow
4 lemon yellow
5 strawberry red
6 blueberry blue
[6 rows x 2 columns]
right_frame <- as.h2o(right)
print(right_frame)
fruit citrus
1 apple FALSE
2 orange TRUE
3 banana FALSE
4 lemon TRUE
5 strawberry FALSE
6 watermelon FALSE
[6 rows x 2 columns]
# Combine the left_frame and right_frame datasets into a single dataset.
# The columns from right_frame will be appended to the right side of the final dataset.
# In addition, because both datasets include a "fruit" column, H2O will append "0" to the
# name of the second "fruit" column (giving "fruit0"). Note that this is different from
# ``merge``, which combines data from two commonly named columns in two datasets.
# Here the rows are paired by position, not by matching values, so row 6 of the result
# holds "blueberry" next to "watermelon".
columns <- h2o.cbind(left_frame, right_frame)
print(columns)
fruit color fruit0 citrus
1 apple red apple FALSE
2 orange orange orange TRUE
3 banana yellow banana FALSE
4 lemon yellow lemon TRUE
5 strawberry red strawberry FALSE
6 blueberry blue watermelon FALSE
[6 rows x 4 columns]