Source code for h2o.frame

# -*- encoding: utf-8 -*-
"""
H2O data frame.

:copyright: (c) 2016 H2O.ai
:license:   Apache License Version 2.0 (see LICENSE for details)
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import csv
import datetime
import functools
import os
import sys
import tempfile
import traceback
import warnings
from io import StringIO
from types import FunctionType

import h2o
from h2o.base import Keyed
from h2o.display import H2ODisplay
from h2o.exceptions import H2OTypeError, H2OValueError
from h2o.expr import ExprNode
from h2o.group_by import GroupBy
from h2o.job import H2OJob
from h2o.utils.compatibility import *  # NOQA
from h2o.utils.compatibility import viewitems, viewvalues
from h2o.utils.config import get_config_value
from h2o.utils.shared_utils import (_handle_numpy_array, _handle_pandas_data_frame, _handle_python_dicts,
                                    _handle_python_lists, _is_list, _is_str_list, _py_tmp_key, _quoted,
                                    can_use_pandas, quote, normalize_slice, slice_is_normalized, check_frame_id)
from h2o.utils.typechecks import (assert_is_type, assert_satisfies, Enum, I, is_type, numeric, numpy_ndarray,
                                  numpy_datetime, pandas_dataframe, pandas_timestamp, scipy_sparse, U)

__all__ = ("H2OFrame", )


class H2OFrame(Keyed):
    """
    Primary data store for H2O.

    H2OFrame is similar to pandas' ``DataFrame``, or R's ``data.frame``. One critical distinction is that the
    data is generally not held in memory, instead it is located on a (possibly remote) H2O cluster, and thus
    ``H2OFrame`` represents a mere handle to that data.

    Create a new H2OFrame object, possibly from some other object.

    :param python_obj: object that will be converted to an ``H2OFrame``. This could have multiple types:

        - None: create an empty H2OFrame
        - A list/tuple of strings or numbers: create a single-column H2OFrame containing the contents of this list.
        - A dictionary of ``{name: list}`` pairs: create an H2OFrame with multiple columns, each column having the
          provided ``name`` and contents from ``list``. If the source dictionary is not an OrderedDict, then the
          columns in the H2OFrame may appear shuffled.
        - A list of lists of strings/numbers: construct an H2OFrame from a rectangular table of values, with inner
          lists treated as rows of the table. I.e. ``H2OFrame([[1, 'a'], [2, 'b'], [3, 'c']])`` will create a
          frame with 3 rows and 2 columns, one numeric and one string.
        - A Pandas dataframe, or a Numpy ndarray: create a matching H2OFrame.
        - A Scipy sparse matrix: create a matching sparse H2OFrame.

    :param int header: if ``python_obj`` is a list of lists, this parameter can be used to indicate whether the
        first row of the data represents headers. The value of -1 means the first row is data, +1 means the first
        row is the headers, 0 (default) allows H2O to guess whether the first row contains data or headers.
    :param List[str] column_names: explicit list of column names for the new H2OFrame. This will override any
        column names derived from the data. If the python_obj does not contain explicit column names, and this
        parameter is not given, then the columns will be named "C1", "C2", "C3", etc.
    :param column_types: explicit column types for the new H2OFrame. This could be either a list of types for
        each column, or a dictionary of {column name: column type} pairs. In the latter case you may override
        types for only few columns, and let H2O choose the types of the rest.
    :param na_strings: List of strings in the input data that should be interpreted as missing values. This could
        be given on a per-column basis, either as a list-of-lists, or as a dictionary {column name: list of nas}.
    :param str destination_frame: (internal) name of the target DKV key in the H2O backend.
    :param str separator: (deprecated)
    :param skipped_columns: forwarded unchanged to the upload/parse step.
        NOTE(review): presumably the columns to leave out of the parsed frame -- confirm against
        ``h2o.parse_setup``.

    :example:

    >>> python_obj = [1, 2, 2.5, -100.9, 0]
    >>> frame = h2o.H2OFrame(python_obj)
    >>> pyunit_utils.check_dims_values(python_obj, frame, rows=5, cols=1)
    """

    # Temp flag: set this to false for now if encountering path conversion/expansion issues when import files to
    # remote server
    __LOCAL_EXPANSION_ON_SINGLE_IMPORT__ = True

    #-------------------------------------------------------------------------------------------------------------------
    # Construction
    #-------------------------------------------------------------------------------------------------------------------

    def __init__(self, python_obj=None, destination_frame=None, header=0, separator=",",
                 column_names=None, column_types=None, na_strings=None, skipped_columns=None):
        """Validate the arguments and, when ``python_obj`` is given, upload it to the cluster (see class docs)."""
        coltype = U(None, "unknown", "uuid", "string", "float", "real", "double", "int", "numeric",
                    "categorical", "factor", "enum", "time")
        assert_is_type(python_obj, None, list, tuple, dict, numpy_ndarray, pandas_dataframe, scipy_sparse)
        assert_is_type(destination_frame, None, str)
        assert_is_type(header, -1, 0, 1)
        assert_is_type(separator, I(str, lambda s: len(s) == 1))
        assert_is_type(column_names, None, [str])
        assert_is_type(column_types, None, [coltype], {str: coltype})
        assert_is_type(na_strings, None, [str], [[str]], {str: [str]})
        check_frame_id(destination_frame)

        self._ex = ExprNode()
        self._ex._children = None
        self._is_frame = True  # Indicate that this is an actual frame, allowing typechecks to be made
        if python_obj is not None:
            self._upload_python_object(python_obj, destination_frame, header, separator,
                                       column_names, column_types, na_strings, skipped_columns)

    @staticmethod
    def _expr(expr, cache=None):
        """
        Wrap an expression node into a new H2OFrame.

        :param expr: expression object that becomes the frame's ``_ex``.
        :param cache: optional cache whose content is copied into the new frame's cache via ``fill_from``.
        :returns: the newly created H2OFrame.
        """
        # TODO: merge this method with `__init__`
        fr = H2OFrame()
        fr._ex = expr
        if cache is not None:
            fr._ex._cache.fill_from(cache)
        return fr

    def _upload_python_object(self, python_obj, destination_frame=None, header=0, separator=",",
                              column_names=None, column_types=None, na_strings=None, skipped_columns=None):
        """
        Serialize a native Python object into a temporary CSV file and upload/parse it into this frame.

        Scipy sparse matrices are delegated to ``_upload_sparse_matrix``. The parameters have the same meaning
        as in the constructor.

        :raises H2OValueError: if the conversion helper yields no header or no data.
        """
        assert_is_type(python_obj, list, tuple, dict, numpy_ndarray, pandas_dataframe, scipy_sparse)
        if is_type(python_obj, scipy_sparse):
            self._upload_sparse_matrix(python_obj, destination_frame=destination_frame)
            return
        # TODO: all these _handlers should really belong to this class, not to shared_utils.
        processor = (_handle_pandas_data_frame if is_type(python_obj, pandas_dataframe) else
                     _handle_numpy_array if is_type(python_obj, numpy_ndarray) else
                     _handle_python_dicts if is_type(python_obj, dict) else
                     _handle_python_lists)
        col_header, data_to_write = processor(python_obj, header)
        if col_header is None or data_to_write is None:
            raise H2OValueError("No data to write")
        if not column_names:
            column_names = col_header

        # create a temporary file that will be written to
        tmp_handle, tmp_path = tempfile.mkstemp(suffix=".csv")
        try:
            # The `with` block closes the stream even if writing a row fails.
            with os.fdopen(tmp_handle, 'w') as tmp_file:
                csv_writer = csv.writer(tmp_file, dialect="excel", quoting=csv.QUOTE_NONNUMERIC)
                csv_writer.writerow(column_names)
                if data_to_write and isinstance(data_to_write[0], dict):
                    # Rows given as dicts: emit the values in header order, None for missing keys.
                    for row in data_to_write:
                        csv_writer.writerow([row.get(k, None) for k in col_header])
                else:
                    csv_writer.writerows(data_to_write)
            # The header row was written above, hence header=1 for the parser.
            self._upload_parse(tmp_path, destination_frame, 1, separator, column_names, column_types,
                               na_strings, skipped_columns)
        finally:
            # Delete the tmp file on success and on failure alike, so a failed upload leaves nothing behind.
            os.remove(tmp_path)

    def _upload_sparse_matrix(self, matrix, destination_frame=None):
        """
        Write a scipy sparse matrix to a temporary SVMLight file, upload it and parse it into this frame.

        Column 0 of the matrix is written as the leading (label) value of each row; the remaining non-zero
        entries are written as ``column:value`` pairs.

        :param matrix: scipy sparse matrix to upload.
        :param destination_frame: id for the resulting frame; a temporary key is generated when None.
        :raises H2OValueError: if ``matrix`` is not a scipy sparse matrix.
        """
        import scipy.sparse as sp
        if not sp.issparse(matrix):
            raise H2OValueError("A sparse matrix expected, got %s" % type(matrix))

        if destination_frame is None:
            destination_frame = _py_tmp_key(h2o.connection().session_id)

        # sp.find(matrix) returns (row indices, column indices, values) of the non-zero elements of A. Unfortunately
        # there is no guarantee that those elements are returned in the correct order, so need to sort
        data = sorted(zip(*sp.find(matrix)))

        tmp_handle, tmp_path = tempfile.mkstemp(suffix=".svmlight")
        try:
            with os.fdopen(tmp_handle, "wt") as out:
                idata = 0  # index of the next element to be consumed from `data`
                for irow in range(matrix.shape[0]):
                    if idata < len(data) and data[idata][0] == irow and data[idata][1] == 0:
                        y = data[idata][2]
                        idata += 1
                    else:
                        y = 0
                    fields = [str(y)]
                    while idata < len(data) and data[idata][0] == irow:
                        fields.append(str(data[idata][1]) + ":" + str(data[idata][2]))
                        idata += 1
                    out.write(" ".join(fields))
                    out.write("\n")
            ret = h2o.api("POST /3/PostFile", filename=tmp_path)
        finally:
            # Remove the temporary file whether or not writing/uploading succeeded.
            os.remove(tmp_path)

        rawkey = ret["destination_frame"]
        p = {"source_frames": [rawkey], "destination_frame": destination_frame}
        H2OJob(h2o.api("POST /3/ParseSVMLight", data=p), "Parse").poll()
        self._ex._cache._id = destination_frame
        self._ex._cache.fill()
@staticmethod
def get_frame(frame_id, rows=10, rows_offset=0, cols=-1, full_cols=-1, cols_offset=0, light=False):
    """
    Look up a frame that already lives on the H2O cluster by its id.

    :param str frame_id: id of the frame to retrieve
    :param int rows: number of rows to fetch for preview (10 by default)
    :param int rows_offset: offset to fetch rows from (0 by default)
    :param int cols: number of columns to fetch (all by default)
    :param full_cols: number of columns to fetch together with backed data
    :param int cols_offset: offset to fetch columns from (0 by default)
    :param bool light: whether to use light frame endpoint or not

    :returns: an H2OFrame handle with the given id, or None when filling its cache raises
        ``EnvironmentError`` (i.e. such frame doesn't exist).
    """
    handle = H2OFrame()
    handle._ex._cache._id = frame_id
    fill_options = dict(rows=rows, rows_offset=rows_offset, cols=cols, full_cols=full_cols,
                        cols_offset=cols_offset, light=light)
    try:
        handle._ex._cache.fill(**fill_options)
    except EnvironmentError:
        return None
    return handle
@staticmethod
def _validate(param, name, required=False, message=None):
    """
    Check that an argument is an H2OFrame (or an allowed None).

    :param param: the value to check.
    :param name: argument name, used to build the default error message.
    :param required: when True, a None ``param`` is rejected.
    :param message: custom error message; a default mentioning ``name`` is used when empty.
    :returns: ``param`` itself when it is not None, otherwise None.
    :raises ValueError: if ``param`` is None and ``required`` is set.
    """
    if not message:
        message = "'{}' must be a valid H2OFrame!".format(name)
    if param is not None:
        assert_is_type(param, H2OFrame, message=message)
        return param
    if required:
        raise ValueError(message)
    return None
def refresh(self):
    """Drop the cached frame information and fetch it again from the backend H2O server."""
    cache = self._ex._cache
    cache.flush()
    self._frame(fill_cache=True)
#-------------------------------------------------------------------------------------------------------------------
# Frame properties
#-------------------------------------------------------------------------------------------------------------------

@property
def key(self):
    """Id stored in the frame's cache, or None when the frame has no expression (``_ex`` is None)."""
    if self._ex is None:
        return None
    return self._ex._cache._id


@property
def names(self):
    """The list of column names (List[str])."""
    cache = self._ex._cache
    if not cache.names_valid():
        # Cached names are stale: reload the frame information from the backend.
        cache.flush()
        self._frame(fill_cache=True)
    return list(self._ex._cache.names)


@names.setter
def names(self, value):
    self.set_names(value)


@property
def nrows(self):
    """Number of rows in the dataframe (int)."""
    cache = self._ex._cache
    if not cache.nrows_valid():
        cache.flush()
        self._frame(fill_cache=True)
    return self._ex._cache.nrows


@property
def ncols(self):
    """Number of columns in the dataframe (int)."""
    cache = self._ex._cache
    if not cache.ncols_valid():
        cache.flush()
        self._frame(fill_cache=True)
    return self._ex._cache.ncols


@property
def shape(self):
    """Number of rows and columns in the dataframe as a tuple ``(nrows, ncols)``."""
    return (self.nrows, self.ncols)


@property
def types(self):
    """The dictionary of column name/type pairs."""
    cache = self._ex._cache
    if not cache.types_valid():
        cache.flush()
        self._frame(fill_cache=True)
    return dict(self._ex._cache.types)


@property
def frame_id(self):
    """Internal id of the frame (str)."""
    evaluated = self._frame()
    return evaluated._ex._cache._id


@frame_id.setter
def frame_id(self, newid):
    check_frame_id(newid)
    if self._ex._cache._id is not None:
        # The frame already has an id: update the cache, then rename it on the backend.
        oldname = self.frame_id
        self._ex._cache._id = newid
        h2o.rapids('(rename "{}" "{}")'.format(oldname, newid))
    else:
        h2o.assign(self, newid)
def type(self, col):
    """
    Look up the type of a single column.

    :param col: either a name, or an index of the column to look up
    :returns: type of the column, one of: ``str``, ``int``, ``real``, ``enum``, ``time``, ``bool``.
    :raises H2OValueError: if such column does not exist in the frame.
    """
    assert_is_type(col, int, str)
    if not (self._ex._cache.types_valid() and self._ex._cache.names_valid()):
        # Either the cached types or the cached names are stale: reload them.
        self._ex._cache.flush()
        self._frame(fill_cache=True)
    column_types = self._ex._cache.types
    if is_type(col, str):
        if col in column_types:
            return column_types[col]
    else:
        column_names = self._ex._cache.names
        count = len(column_names)
        # Negative indices are accepted and count from the end.
        if -count <= col < count:
            return column_types[column_names[col]]
    raise H2OValueError("Column '%r' does not exist in the frame" % col)
def _import_parse(self, path, pattern, destination_frame, header, separator, column_names, column_types,
                  na_strings, skipped_columns=None, custom_non_data_line_markers=None):
    """Lazily import ``path`` on the cluster, parse the imported key into this frame and return ``self``."""
    expand_locally = H2OFrame.__LOCAL_EXPANSION_ON_SINGLE_IMPORT__
    if expand_locally and is_type(path, str) and "://" not in path:
        # fixme: delete those 2 lines, cf. PUBDEV-5717
        path = os.path.abspath(path)
    rawkey = h2o.lazy_import(path, pattern)
    self._parse(rawkey, destination_frame, header, separator, column_names, column_types, na_strings,
                skipped_columns, custom_non_data_line_markers)
    return self


def _upload_parse(self, path, destination_frame, header, sep, column_names, column_types, na_strings,
                  skipped_columns=None):
    """Post the local file ``path`` to the cluster, parse the resulting key into this frame, return ``self``."""
    upload = h2o.api("POST /3/PostFile", filename=path)
    self._parse(upload["destination_frame"], destination_frame, header, sep, column_names, column_types,
                na_strings, skipped_columns)
    return self


def _parse(self, rawkey, destination_frame="", header=None, separator=None, column_names=None,
           column_types=None, na_strings=None, skipped_columns=None, custom_non_data_line_markers=None):
    """Obtain a parse setup for ``rawkey`` from ``h2o.parse_setup`` and run the parse with it."""
    setup = h2o.parse_setup(rawkey, destination_frame, header, separator, column_names, column_types,
                            na_strings, skipped_columns, custom_non_data_line_markers)
    return self._parse_raw(setup)


def _parse_raw(self, setup):
    """
    Submit a parse job built from ``setup``, wait for it, then point this frame's cache at the result.

    :param setup: mapping of parse settings; only the keys recognised below are sent to the backend.
    """
    # Parse parameters (None values provided by setup)
    params = {
        "destination_frame": None,
        "parse_type": None,
        "separator": None,
        "single_quotes": None,
        "check_header": None,
        "number_columns": None,
        "chunk_size": None,
        "delete_on_done": True,
        "blocking": False,
        "column_types": None,
        "skipped_columns": None,
        "custom_non_data_line_markers": setup["custom_non_data_line_markers"],
    }
    # These two are only sent when the setup actually provides a value for them.
    for optional in ("column_names", "na_strings"):
        if setup[optional]:
            params[optional] = None

    # Copy over every recognised setting from the setup.
    for setting, value in viewitems(setup):
        if setting in params:
            params[setting] = value

    # Extract only 'name' from each src in the array of srcs
    params['source_frames'] = [_quoted(src['name']) for src in setup['source_frames']]

    H2OJob(h2o.api("POST /3/Parse", data=params), "Parse").poll()
    # Need to return a Frame here for nearly all callers
    # ... but job stats returns only a dest_key, requiring another REST call to get nrow/ncol
    self._ex._cache._id = params["destination_frame"]
    self._ex._cache.fill()
def filter_na_cols(self, frac=0.2):
    """
    Filter out columns whose proportion of NAs is greater than or equal to ``frac``.

    :param float frac: Maximum fraction of NAs in the column to keep.

    :returns: A list of indices of columns that have fewer NAs than ``frac``. If all columns are filtered,
        None is returned.
    """
    node = ExprNode("filterNACols", self, frac)
    return node._eager_scalar()
def columns_by_type(self, coltype="numeric"):
    """
    Find the columns of the frame that have the given type.

    :param str coltype: A character string indicating which column type to filter by. This must be
        one of the following:

        - ``"numeric"``      - Numeric, but not categorical or time
        - ``"categorical"``  - Integer, with a categorical/factor String mapping
        - ``"string"``       - String column
        - ``"time"``         - Long msec since the Unix Epoch - with a variety of display/parse options
        - ``"uuid"``         - UUID
        - ``"bad"``          - No non-NA rows (all NAs or zero rows)

    :returns: list of indices of columns that have the requested type
    """
    assert_is_type(coltype, "numeric", "categorical", "string", "time", "uuid", "bad")
    assert_is_type(self, H2OFrame)
    node = ExprNode("columnsByType", self, coltype)
    return node._eager_scalar()
def __iter__(self): return (self[i] for i in range(self.ncol)) def __unicode__(self): if sys.gettrace() is None: if self._ex is None: return "This H2OFrame has been removed." table = self._frame(fill_cache=True)._ex._cache._tabulate("simple", False) nrows = "%d %s" % (self.nrow, "row" if self.nrow == 1 else "rows") ncols = "%d %s" % (self.ncol, "column" if self.ncol == 1 else "columns") return "%s\n\n[%s x %s]" % (table, nrows, ncols) return "" def __repr__(self): if sys.gettrace() is None: # PUBDEV-2278: using <method>? from IPython caused everything to dump stk = traceback.extract_stack() if not ("IPython" in stk[-2][0] and "info" == stk[-2][2]): self.show() return "" def _has_content(self): return self._ex and (self._ex._children or self._ex._cache._id)
def show(self, use_pandas=False, rows=10, cols=200):
    """
    Print or display a snippet of the data frame; used by ``H2OFrame.__repr__``.

    If called from IPython, displays the results in HTML format. Otherwise, this prints a tabulated result.

    :param bool use_pandas: when True and pandas can be used, show the head of the frame as a pandas frame.
    :param int rows: number of rows to show.
    :param int cols: number of columns to show (only used together with ``use_pandas``).
    """
    if self._ex is None:
        print("This H2OFrame has been removed.")
        return
    if not self._has_content():
        print("This H2OFrame is empty and not initialized.")
        return
    if self.nrows == 0:
        print("This H2OFrame is empty.")
        return
    if not self._ex._cache.is_valid():
        self._frame()._ex._cache.fill()

    if H2ODisplay._in_zep():
        print("%html " + self._ex._cache._tabulate("html", False, rows=rows))
        return

    if H2ODisplay._in_ipy():
        import IPython.display
        if use_pandas and can_use_pandas():
            IPython.display.display(self.head(rows=rows, cols=cols).as_data_frame(use_pandas=True))
        else:
            IPython.display.display_html(self._ex._cache._tabulate("html", False, rows=rows), raw=True)
        return

    if use_pandas and can_use_pandas():
        print(self.head(rows=rows, cols=cols).as_data_frame(use_pandas=True))
        return

    text = self.__unicode__()
    # The stack must be inspected from within this very function: the index below is relative to it.
    stack = traceback.extract_stack()
    if "IPython" in stack[-3][0]:
        text = "\n%s" % text
    try:
        print(text)
    except UnicodeEncodeError:
        print(text.encode("ascii", "replace"))
def summary(self, return_data=False):
    """
    Display summary information about the frame.

    Summary includes min/mean/max/sigma and other rollup data.

    :param bool return_data: Return a dictionary of the summary output instead of displaying it.
    :returns: the cached summary data when ``return_data`` is set (or when the frame is not initialized);
        None when the summary is displayed or when the frame has been removed.
    """
    if self._ex is None:
        # A detached frame has no cache to read from; report it the same way `show()` does
        # instead of failing with an AttributeError on `self._ex._cache`.
        print("This H2OFrame has been removed.")
        return None
    if not self._has_content():
        print("This H2OFrame is empty and not initialized.")
        return self._ex._cache._data
    if not self._ex._cache.is_valid():
        self._frame()._ex._cache.fill()
    if not return_data:
        if self.nrows == 0:
            print("This H2OFrame is empty.")
        elif H2ODisplay._in_zep():
            print("%html " + self._ex._cache._tabulate("html", True))
        elif H2ODisplay._in_ipy():
            import IPython.display
            IPython.display.display_html(self._ex._cache._tabulate("html", True), raw=True)
        else:
            print(self._ex._cache._tabulate("simple", True))
    else:
        return self._ex._cache._data
def describe(self, chunk_summary=False):
    """
    Generate an in-depth description of this H2OFrame.

    This will print to the console the dimensions of the frame; names/types/summary statistics for each column;
    and finally first ten rows of the frame.

    :param bool chunk_summary: Retrieve the chunk summary along with the distribution summary
    """
    if self._has_content():
        endpoint = "GET /3/Frames/%s" % self.frame_id
        res = h2o.api(endpoint, data={"row_count": 10})["frames"][0]
        self._ex._cache._fill_data(res)

        print("Rows:{}".format(self.nrow))
        print("Cols:{}".format(self.ncol))

        # The chunk & distribution summaries are not cached, so must be pulled if chunk_summary=True.
        if chunk_summary:
            for section in ("chunk_summary", "distribution_summary"):
                res[section].show()
        print("\n")
    self.summary()
def detach(self):
    """Disconnect this handle from its expression by resetting ``_ex`` to None."""
    self._ex = None
def _frame(self, rows=10, rows_offset=0, cols=-1, cols_offset=0, fill_cache=False): self._ex._eager_frame() if fill_cache: self._ex._cache.fill(rows=rows, rows_offset=rows_offset, cols=cols, cols_offset=cols_offset) return self
def head(self, rows=10, cols=200):
    """
    Cut the top-left corner out of the frame.

    :param int rows: maximum number of rows to return
    :param int cols: maximum number of columns to return

    :returns: a new H2OFrame cut from the top left corner of the current frame, and having dimensions at
        most ``rows`` x ``cols``.
    """
    assert_is_type(rows, int)
    assert_is_type(cols, int)
    # Never ask for more rows/columns than the frame actually has.
    nrows = min(self.nrows, rows)
    ncols = min(self.ncols, cols)
    corner = self[:nrows, :ncols]
    return corner._frame(rows=nrows, cols=cols, fill_cache=True)
def tail(self, rows=10, cols=200):
    """
    Cut the bottom-left corner out of the frame.

    :param int rows: maximum number of rows to return
    :param int cols: maximum number of columns to return

    :returns: a new H2OFrame cut from the bottom left corner of the current frame, and having dimensions at
        most ``rows`` x ``cols``.
    """
    assert_is_type(rows, int)
    assert_is_type(cols, int)
    nrows = min(self.nrows, rows)
    ncols = min(self.ncols, cols)
    # The slice covers the last `nrows` rows of the frame.
    first_row = self.nrows - nrows
    last_row = first_row + nrows
    corner = self[first_row:last_row, :ncols]
    return corner._frame(rows=nrows, cols=cols, fill_cache=True)
def logical_negation(self):
    """
    Apply elementwise logical NOT to the current frame.

    :returns: new H2OFrame holding the negated values.
    """
    negation = ExprNode("not", self)
    return H2OFrame._expr(expr=negation, cache=self._ex._cache)
def _unop(self, op, rtype="real"):
    """
    Build a new frame that applies the unary operation ``op`` to this frame.

    :param op: name of the operation placed into the expression node.
    :param rtype: column type recorded for every column of the result.
    :returns: new H2OFrame whose columns are named ``op(<original name>)``.
    :raises H2OValueError: if this is an actual frame and one of its columns is not int/real/bool.
    """
    if self._is_frame:
        for col_name, col_type in self.types.items():
            if col_type not in ("int", "real", "bool"):
                raise H2OValueError("Function %s cannot be applied to %s column '%s'" % (op, col_type, col_name))
    result = H2OFrame._expr(expr=ExprNode(op, self), cache=self._ex._cache)
    result_cache = result._ex._cache
    result_cache._names = ["%s(%s)" % (op, name) for name in self._ex._cache._names]
    result_cache._types = {name: rtype for name in result_cache._names}
    return result


# Binary operations
def __add__(self, rhs):
    return _binop(self, "+", rhs)


def __sub__(self, rhs):
    return _binop(self, "-", rhs)


def __mul__(self, rhs):
    return _binop(self, "*", rhs)


def __div__(self, rhs):
    return _binop(self, "/", rhs)


def __truediv__(self, rhs):
    return _binop(self, "/", rhs)


def __floordiv__(self, rhs):
    return _binop(self, "intDiv", rhs)


def __mod__(self, rhs):
    return _binop(self, "%", rhs)


def __or__(self, rhs):
    return _binop(self, "|", rhs, rtype="bool")


def __and__(self, rhs):
    return _binop(self, "&", rhs, rtype="bool")


def __ge__(self, rhs):
    return _binop(self, ">=", rhs, rtype="bool")


def __gt__(self, rhs):
    return _binop(self, ">", rhs, rtype="bool")


def __le__(self, rhs):
    return _binop(self, "<=", rhs, rtype="bool")


def __lt__(self, rhs):
    return _binop(self, "<", rhs, rtype="bool")


def __eq__(self, rhs):
    # Comparison against None is carried out as a comparison against NaN.
    if rhs is None:
        rhs = float("nan")
    return _binop(self, "==", rhs, rtype="bool")


def __ne__(self, rhs):
    # Comparison against None is carried out as a comparison against NaN.
    if rhs is None:
        rhs = float("nan")
    return _binop(self, "!=", rhs, rtype="bool")


def __pow__(self, rhs):
    return _binop(self, "^", rhs)


def __contains__(self, lhs):
    # A list is contained when every one of its items occurs somewhere in the frame.
    if _is_list(lhs):
        return all((item == self).any() for item in lhs)
    return (lhs == self).any()


# rops
def __rmod__(self, lhs):
    return _binop(lhs, "%", self)


def __radd__(self, lhs):
    return _binop(lhs, "+", self)


def __rsub__(self, lhs):
    return _binop(lhs, "-", self)


def __rand__(self, lhs):
    return _binop(lhs, "&", self, rtype="bool")


def __ror__(self, lhs):
    return _binop(lhs, "|", self, rtype="bool")


def __rtruediv__(self, lhs):
    return _binop(lhs, "/", self)


def __rdiv__(self, lhs):
    return _binop(lhs, "/", self)


def __rfloordiv__(self, lhs):
    return _binop(lhs, "intDiv", self, rtype="int")


def __rmul__(self, lhs):
    return _binop(lhs, "*", self)


def __rpow__(self, lhs):
    return _binop(lhs, "^", self)


# unops
def __abs__(self):
    return self._unop("abs")


def __invert__(self):
    return self._unop("!!", rtype="bool")


def __nonzero__(self):
    # Only frames with at most one row and one column may be used in a boolean context.
    if self.nrows <= 1 and self.ncols <= 1:
        return self.__len__()
    raise H2OValueError(
        'This operation is not supported on an H2OFrame. Try using parentheses. '
        'Did you mean & (logical and), | (logical or), or ~ (logical not)?')


def __int__(self):
    scalar = self.flatten()
    return int(scalar)


def __float__(self):
    scalar = self.flatten()
    return float(scalar)
def flatten(self):
    """
    Turn a 1x1 frame into a plain scalar.

    :returns: content of this 1x1 frame as a scalar (``int``, ``float``, or ``str``).
    :raises H2OValueError: if current frame has shape other than 1x1
    """
    if self.shape == (1, 1):
        return ExprNode("flatten", self)._eager_scalar()
    raise H2OValueError("Not a 1x1 Frame")
def getrow(self):
    """
    Turn a 1xn frame into an n-element list.

    :returns: content of this 1xn frame as a Python list.
    :raises H2OValueError: if the current frame does not have exactly one row.
    """
    if self.nrows == 1:
        return ExprNode("getrow", self)._eager_scalar()
    raise H2OValueError("This method can only be applied to single-row frames")
def mult(self, matrix):
    """
    Multiply this frame, viewed as a matrix, by another matrix.

    :param matrix: another frame that you want to multiply the current frame by; must be compatible with the
        current frame (i.e. its number of rows must be the same as number of columns in the current frame).
    :returns: new H2OFrame, which is the result of multiplying the current frame by ``matrix``.
    :raises H2OValueError: if the dimensions of the two frames do not match as described above.
    """
    if self.ncols == matrix.nrows:
        product = ExprNode("x", self, matrix)
        return H2OFrame._expr(expr=product)
    raise H2OValueError("Matrix is not compatible for multiplication with the current frame")
def cos(self):
    """
    Compute the elementwise cosine of the current frame.

    :returns: new H2OFrame with the cosine of every value.
    """
    return self._unop("cos")
def sin(self):
    """
    Compute the elementwise sine of the current frame.

    :returns: new H2OFrame with the sine of every value.
    """
    return self._unop("sin")
def tan(self):
    """
    Compute the elementwise tangent of the current frame.

    :returns: new H2OFrame with the tangent of every value.
    """
    return self._unop("tan")
def acos(self):
    """
    Compute the elementwise arc cosine of the current frame.

    :returns: new H2OFrame with the arc cosine of every value.
    """
    return self._unop("acos")
def asin(self):
    """
    Compute the elementwise arc sine of the current frame.

    :returns: new H2OFrame with the arc sine of every value.
    """
    return self._unop("asin")
def atan(self):
    """
    Compute the elementwise arc tangent of the current frame.

    :returns: new H2OFrame with the arc tangent of every value.
    """
    return self._unop("atan")
def cosh(self):
    """
    Compute the elementwise hyperbolic cosine of the current frame.

    :returns: new H2OFrame with the hyperbolic cosine of every value.
    """
    return self._unop("cosh")
def sinh(self):
    """
    Compute the elementwise hyperbolic sine of the current frame.

    :returns: new H2OFrame with the hyperbolic sine of every value.
    """
    return self._unop("sinh")
def tanh(self):
    """
    Compute the elementwise hyperbolic tangent of the current frame.

    :returns: new H2OFrame with the hyperbolic tangent of every value.
    """
    return self._unop("tanh")
def acosh(self):
    """
    Compute the elementwise inverse hyperbolic cosine of the current frame.

    :returns: new H2OFrame with the inverse hyperbolic cosine of every value.
    """
    return self._unop("acosh")
def asinh(self):
    """
    Compute the elementwise inverse hyperbolic sine of the current frame.

    :returns: new H2OFrame with the inverse hyperbolic sine of every value.
    """
    return self._unop("asinh")
def atanh(self):
    """
    Compute the elementwise inverse hyperbolic tangent of the current frame.

    :returns: new H2OFrame with the inverse hyperbolic tangent of every value.
    """
    return self._unop("atanh")
def cospi(self):
    """
    Compute the elementwise cosine of the current frame multiplied by Pi.

    :returns: new H2OFrame with the resulting values.
    """
    return self._unop("cospi")
def sinpi(self):
    """
    Compute the elementwise sine of the current frame multiplied by Pi.

    :returns: new H2OFrame with the resulting values.
    """
    return self._unop("sinpi")
def tanpi(self):
    """
    Compute the elementwise tangent of the current frame multiplied by Pi.

    :returns: new H2OFrame with the resulting values.
    """
    return self._unop("tanpi")
def abs(self):
    """
    Compute the elementwise absolute value of the current frame.

    :returns: new H2OFrame with the absolute value of every entry.
    """
    return self._unop("abs")
def sign(self):
    """
    Compute the signs of the values in the frame: -1, +1, or 0.

    :returns: new H2OFrame of signs; its columns are recorded as type ``int``.
    """
    return self._unop("sign", rtype="int")
def sqrt(self):
    """
    Compute the elementwise square root of the current frame.

    :returns: new H2OFrame with the square root of every value.
    """
    return self._unop("sqrt")
def trunc(self):
    """
    Apply the numeric truncation function.

    ``trunc(x)`` is the integer obtained from ``x`` by dropping its decimal tail. This is equal to ``floor(x)``
    if ``x`` is positive, and ``ceil(x)`` if ``x`` is negative. Truncation is also called "rounding towards zero".

    :returns: new H2OFrame of truncated values of the original frame; its columns are recorded as type ``int``.
    """
    return self._unop("trunc", rtype="int")
def ceil(self):
    """
    Apply the ceiling function to the current frame.

    ``ceil(x)`` is the smallest integer greater or equal to ``x``.

    :returns: new H2OFrame of ceiling values of the original frame; its columns are recorded as type ``int``.
    """
    # The operation is named "ceiling" in the expression, unlike this method.
    return self._unop("ceiling", rtype="int")
def floor(self):
    """
    Apply the floor function to the current frame.

    ``floor(x)`` is the largest integer smaller or equal to ``x``.

    :returns: new H2OFrame of floor values of the original frame; its columns are recorded as type ``int``.
    """
    return self._unop("floor", rtype="int")
def log(self):
    """
    Compute the elementwise natural logarithm of the current frame.

    :returns: new H2OFrame with the natural logarithm of every value.
    """
    return self._unop("log")
def log10(self):
    """
    Compute the elementwise decimal logarithm of the current frame.

    :returns: new H2OFrame with the decimal logarithm of every value.
    """
    return self._unop("log10")
def log1p(self):
    """
    Compute elementwise ``ln(1 + x)`` for each ``x`` in the current frame.

    :returns: new H2OFrame with the resulting values.
    """
    return self._unop("log1p")
def log2(self):
    """
    Compute the elementwise binary logarithm of the current frame.

    :returns: new H2OFrame with the binary logarithm of every value.
    """
    return self._unop("log2")
def exp(self):
    """
    Compute the elementwise exponent (i.e. ``e^x``) of the current frame.

    :returns: new H2OFrame with the resulting values.
    """
    return self._unop("exp")
def expm1(self):
    """
    Compute the elementwise exponent minus 1 (i.e. ``e^x - 1``) of the current frame.

    :returns: new H2OFrame with the resulting values.
    """
    return self._unop("expm1")
def gamma(self):
    """
    Compute the elementwise gamma function of the current frame.

    :returns: new H2OFrame with the gamma function of every value.
    """
    return self._unop("gamma")
def lgamma(self):
    """
    Compute the elementwise logarithm of the gamma function of the current frame.

    :returns: new H2OFrame with the resulting values.
    """
    return self._unop("lgamma")
def digamma(self):
    """
    Compute the elementwise digamma function of the current frame.

    :returns: new H2OFrame with the digamma function of every value.
    """
    return self._unop("digamma")
def trigamma(self):
    """
    Compute the elementwise trigamma function of the current frame.

    :returns: new H2OFrame with the trigamma function of every value.
    """
    return self._unop("trigamma")
@staticmethod
def moment(year=None, month=None, day=None, hour=None, minute=None, second=None, msec=None, date=None, time=None):
    """
    Create a time column from individual components.

    Each parameter should be either an integer, or a single-column H2OFrame
    containing the corresponding time parts for each row.

    The "date" part of the timestamp can be specified using either the tuple ``(year, month, day)``, or an
    explicit ``date`` parameter. The "time" part of the timestamp is optional, but can be specified either via
    the ``time`` parameter, or via the ``(hour, minute, second, msec)`` tuple.

    :param year: the year part of the constructed date
    :param month: the month part of the constructed date
    :param day: the day-of-the-month part of the constructed date
    :param hour: the hours part of the constructed date
    :param minute: the minutes part of the constructed date
    :param second: the seconds part of the constructed date
    :param msec: the milliseconds part of the constructed date
    :param date date: construct the timestamp from the Python's native ``datetime.date`` (or
        ``datetime.datetime``) object. If the object passed is of type ``date``, then you can specify the
        time part using either the ``time`` argument, or ``hour`` ... ``msec`` arguments (but not both).
        If the object passed is of type ``datetime``, then no other arguments can be provided.
    :param time time: construct the timestamp from this Python's native ``datetime.time`` object. This
        argument cannot be used alone, it should be supplemented with either ``date`` argument, or
        ``year`` ... ``day`` tuple.

    :returns: H2OFrame with one column containing the date constructed from the provided arguments.
    :raises H2OValueError: on conflicting or missing arguments, or on incompatible column frames.
    """
    assert_is_type(date, None, datetime.date, numpy_datetime, pandas_timestamp)
    assert_is_type(time, None, datetime.time)
    for component in (year, month, day, hour, minute, second, msec):
        assert_is_type(component, None, int, H2OFrame)

    if time is not None:
        if any(part is not None for part in (hour, minute, second, msec)):
            raise H2OValueError("Arguments hour, minute, second, msec cannot be used together with `time`.")
        hour, minute, second = time.hour, time.minute, time.second
        msec = time.microsecond // 1000

    if date is not None:
        # Normalize pandas / numpy date objects into native Python ones first.
        if is_type(date, pandas_timestamp):
            date = date.to_pydatetime()
        if is_type(date, numpy_datetime):
            date = date.astype("M8[ms]").astype("O")
        if any(part is not None for part in (year, month, day)):
            raise H2OValueError("Arguments year, month and day cannot be used together with `date`.")
        year, month, day = date.year, date.month, date.day
        if isinstance(date, datetime.datetime):
            if time is not None:
                raise H2OValueError("Argument `time` cannot be used together with `date` of datetime type.")
            if any(part is not None for part in (hour, minute, second, msec)):
                raise H2OValueError("Arguments hour, minute, second, msec cannot be used together with `date` "
                                    "of datetime type.")
            hour, minute, second = date.hour, date.minute, date.second
            msec = date.microsecond // 1000

    if year is None or month is None or day is None:
        raise H2OValueError("Either arguments (`year`, `month` and `day`) or the `date` are required.")

    # The time-of-day parts are optional and default to zero.
    hour = 0 if hour is None else hour
    minute = 0 if minute is None else minute
    second = 0 if second is None else second
    msec = 0 if msec is None else msec

    components = (("year", year), ("month", month), ("day", day), ("hour", hour),
                  ("minute", minute), ("second", second), ("msec", msec))
    res_nrows = None
    for label, value in components:
        if not isinstance(value, H2OFrame):
            continue
        if value.ncols != 1:
            raise H2OValueError("Argument `%s` is a frame with more than 1 column" % label)
        if value.type(0) not in {"int", "real"}:
            raise H2OValueError("Column `%s` is not numeric (type = %s)" % (label, value.type(0)))
        # The first frame argument fixes the row count that every other frame must match.
        if res_nrows is None:
            res_nrows = value.nrows
        if value.nrows == 0 or value.nrows != res_nrows:
            raise H2OValueError("Incompatible column `%s` having %d rows" % (label, value.nrows))
    if res_nrows is None:
        res_nrows = 1

    res = H2OFrame._expr(ExprNode("moment", year, month, day, hour, minute, second, msec))
    result_cache = res._ex._cache
    result_cache._names = ["name"]
    result_cache._types = {"name": "time"}
    result_cache._nrows = res_nrows
    result_cache._ncols = 1
    return res
def unique(self):
    """
    Extract the unique values in the column.

    :returns: H2OFrame of just the unique values in the column.
    """
    unique_expr = ExprNode("unique", self)
    return H2OFrame._expr(expr=unique_expr)
def levels(self):
    """
    Get the factor levels.

    :returns: A list of lists, one list per column, of levels.
    """
    rows = H2OFrame._expr(expr=ExprNode("levels", self)).as_data_frame(False)
    del rows[0]  # Remove column headers
    # Transpose the row-wise table so that each entry holds one column's values.
    columns = list(zip(*rows))
    result = []
    for column in columns:
        # Empty strings are dropped from each column's list of levels.
        result.append([level for level in column if level != ''])
    return result
def nlevels(self):
    """
    Get the number of factor levels for each categorical column.

    :returns: A list of the number of levels per column, or ``0`` when :meth:`levels` returns an empty result.
    """
    levels = self.levels()
    if not levels:
        return 0
    return [len(column_levels) for column_levels in levels]
def set_level(self, level):
    """
    A method to set all column values to one of the levels.

    :param str level: The level at which the column will be set (a string)

    :returns: H2OFrame with entries set to the desired level.
    """
    set_level_expr = ExprNode("setLevel", self, level)
    return H2OFrame._expr(expr=set_level_expr, cache=self._ex._cache)
def set_levels(self, levels):
    """
    Replace the levels of a categorical column.

    New levels must be aligned with the old domain.

    :param List[str] levels: A list of strings specifying the new levels. The number of new levels must match
        the number of old levels.
    :returns: A single-column H2OFrame with the desired levels.
    """
    assert_is_type(levels, [str])
    set_domain_expr = ExprNode("setDomain", self, False, levels)
    return H2OFrame._expr(expr=set_domain_expr, cache=self._ex._cache)
def rename(self, columns=None):
    """
    Change names of columns in the frame.

    Dict key is an index or name of the column whose name is to be set.
    Dict value is the new name of the column. Keys that do not refer to an existing column (an out-of-range
    index or an unknown name) are ignored.

    :param columns: dict-like transformations to apply to the column names. ``None`` (the default) is treated
        as an empty mapping.
    :returns: the result of :meth:`set_names` called with the updated list of names.
    """
    assert_is_type(columns, None, dict)
    if columns is None:
        # The declared default used to fail with AttributeError on ``None.items()``.
        columns = {}
    old_names = self.names
    # Work on a copy so that the frame's current list of names is never modified in place.
    new_names = list(old_names)
    ncols = self.ncols

    for col, name in columns.items():
        col_index = None
        if is_type(col, int) and (-ncols <= col < ncols):
            col_index = (col + ncols) % ncols  # handle negative indices
        elif is_type(col, str) and col in old_names:
            col_index = old_names.index(col)  # lookup the name

        if col_index is not None:
            new_names[col_index] = name
    return self.set_names(new_names)
def set_names(self, names):
    """
    Change names of all columns in the frame.

    :param List[str] names: The list of new names for every column in the frame.
    :returns: this frame.
    """
    assert_is_type(names, [str])
    assert_satisfies(names, len(names) == self.ncol)
    all_columns = range(self.ncol)
    self._ex = ExprNode("colnames=", self, all_columns, names)  # Update-in-place, but still lazy
    return self
def set_name(self, col=None, name=None):
    """
    Set a new name for a column.

    :param col: index or name of the column whose name is to be set; may be skipped for 1-column frames
    :param name: the new name of the column
    """
    assert_is_type(col, None, int, str)
    assert_is_type(name, str)
    ncols = self.ncols

    # Resolve ``col`` into a non-negative column index.
    if is_type(col, int):
        if col < -ncols or col >= ncols:
            raise H2OValueError("Index %d is out of bounds for a frame with %d columns" % (col, ncols))
        col_index = (col + ncols) % ncols  # handle negative indices
    elif is_type(col, str):
        if col not in self.names:
            raise H2OValueError("Column %s doesn't exist in the frame." % col)
        col_index = self.names.index(col)  # lookup the name
    else:
        assert col is None
        if ncols != 1:
            raise H2OValueError("The frame has %d columns; please specify which one to rename" % ncols)
        col_index = 0

    # Remember the current name before the expression (and its cache) is replaced below.
    oldname = self.names[col_index]
    if name != oldname and name in self.types:
        raise H2OValueError("Column '%s' already exists in the frame" % name)

    previous_cache = self._ex._cache
    self._ex = ExprNode("colnames=", self, col_index, name)  # Update-in-place, but still lazy
    self._ex._cache.fill_from(previous_cache)
    if self.names is None:
        self._frame()._ex._cache.fill()
    else:
        # Patch the cached names and types so that they reflect the rename.
        before = self.names[:col_index]
        after = self.names[col_index + 1:]
        self._ex._cache._names = before + [name] + after
        self._ex._cache._types[name] = self._ex._cache._types.pop(oldname)
    return
def as_date(self, format):
    """
    Convert the frame (containing strings / categoricals) into the ``date`` format.

    :param str format: the format string (e.g. "%Y-%m-%d")
    :returns: new H2OFrame with "int" column types
    """
    date_expr = ExprNode("as.Date", self, format)
    fr = H2OFrame._expr(expr=date_expr, cache=self._ex._cache)
    if fr._ex._cache.types_valid():
        # Every column keeps its name, but its cached type becomes "int".
        int_types = {}
        for col_name in self._ex._cache.types.keys():
            int_types[col_name] = "int"
        fr._ex._cache.types = int_types
    return fr
def cumsum(self, axis=0):
    """
    Compute cumulative sum over rows / columns of the frame.

    :param int axis: 0 for column-wise, 1 for row-wise
    :returns: new H2OFrame with cumulative sums of the original frame.
    """
    cumsum_expr = ExprNode("cumsum", self, axis)
    return H2OFrame._expr(expr=cumsum_expr, cache=self._ex._cache)
def cumprod(self, axis=0):
    """
    Compute cumulative product over rows / columns of the frame.

    :param int axis: 0 for column-wise, 1 for row-wise
    :returns: new H2OFrame with cumulative products of the original frame.
    """
    cumprod_expr = ExprNode("cumprod", self, axis)
    return H2OFrame._expr(expr=cumprod_expr, cache=self._ex._cache)
def cummin(self, axis=0):
    """
    Compute cumulative minimum over rows / columns of the frame.

    :param int axis: 0 for column-wise, 1 for row-wise
    :returns: new H2OFrame with running minimums of the original frame.
    """
    cummin_expr = ExprNode("cummin", self, axis)
    return H2OFrame._expr(expr=cummin_expr, cache=self._ex._cache)
def cummax(self, axis=0):
    """
    Compute cumulative maximum over rows / columns of the frame.

    :param int axis: 0 for column-wise, 1 for row-wise
    :returns: new H2OFrame with running maximums of the original frame.
    """
    cummax_expr = ExprNode("cummax", self, axis)
    return H2OFrame._expr(expr=cummax_expr, cache=self._ex._cache)
def prod(self, na_rm=False):
    """
    Compute the product of all values across all rows in a single column H2O frame.

    If you apply this command on a multi-column H2O frame, the answer may not be correct.

    :param bool na_rm: If True then NAs will be ignored during the computation.
    :returns: product of all values in the frame (a float)
    """
    # The NA-ignoring variant is a separate operation name.
    if na_rm:
        op_name = "prod.na"
    else:
        op_name = "prod"
    return ExprNode(op_name, self)._eager_scalar()
def any(self):
    """Return True if any element in the frame is either True, non-zero or NA."""
    outcome = ExprNode("any", self)._eager_scalar()
    return bool(outcome)
def any_na_rm(self):
    """Return True if any value in the frame is non-zero (disregarding all NAs)."""
    outcome = ExprNode("any.na", self)._eager_scalar()
    return bool(outcome)
def all(self):
    """Return True if every element in the frame is either True, non-zero or NA."""
    outcome = ExprNode("all", self)._eager_scalar()
    return bool(outcome)
def isnumeric(self):
    """
    Test which columns in the frame are numeric.

    :returns: a list of True/False indicating for each column in the frame whether it is numeric.
    """
    raw_flags = ExprNode("is.numeric", self)._eager_scalar()
    flags = []
    for raw in raw_flags:
        flags.append(bool(raw))
    return flags
def isstring(self):
    """
    Test which columns in the frame are string.

    :returns: a list of True/False indicating for each column in the frame whether it is a string column.
    """
    raw_flags = ExprNode("is.character", self)._eager_scalar()
    flags = []
    for raw in raw_flags:
        flags.append(bool(raw))
    return flags
def isin(self, item):
    """
    Test whether elements of an H2OFrame are contained in the ``item``.

    :param item: An item or a list of items to compare the H2OFrame against.

    :returns: An H2OFrame of 0s and 1s showing whether each element in the original H2OFrame is contained in item.
    """
    # A single (non-collection) item is a plain element-wise comparison.
    if not is_type(item, list, tuple, set):
        return self == item

    # Single string / categorical column: delegate to ``match``.
    if self.ncols == 1 and (self.type(0) == 'str' or self.type(0) == 'enum'):
        return self.match(item)

    # Otherwise OR together one equality comparison per candidate value.
    comparisons = (self == candidate for candidate in item)
    return functools.reduce(H2OFrame.__or__, comparisons)
def kfold_column(self, n_folds=3, seed=-1):
    """
    Build a fold assignments column for cross-validation.

    This method will produce a column having the same data layout as the source frame.

    :param int n_folds: An integer specifying the number of validation sets to split the training data into.
    :param int seed: Seed for random numbers as fold IDs are randomly assigned.

    :returns: A single column H2OFrame with the fold assignments.
    """
    folds = H2OFrame._expr(expr=ExprNode("kfold_column", self, n_folds, seed))
    return folds._frame()  # want this to be eager!
def modulo_kfold_column(self, n_folds=3):
    """
    Build a fold assignments column for cross-validation.

    Rows are assigned a fold according to the current row number modulo ``n_folds``.

    :param int n_folds: An integer specifying the number of validation sets to split the training data into.
    :returns: A single-column H2OFrame with the fold assignments.
    """
    folds = H2OFrame._expr(expr=ExprNode("modulo_kfold_column", self, n_folds))
    return folds._frame()  # want this to be eager!
def stratified_kfold_column(self, n_folds=3, seed=-1):
    """
    Build a fold assignment column with the constraint that each fold has the same class
    distribution as the fold column.

    :param int n_folds: The number of folds to build.
    :param int seed: A seed for the random number generator.

    :returns: A single column H2OFrame with the fold assignments.
    """
    folds = H2OFrame._expr(expr=ExprNode("stratified_kfold_column", self, n_folds, seed))
    return folds._frame()  # want this to be eager!
def structure(self):
    """Compactly display the internal structure of an H2OFrame."""
    table = self.as_data_frame(use_pandas=False)
    col_names = table.pop(0)  # the first row holds the column names
    n_rows = self.nrow
    n_cols = self.ncol
    width = max([len(c) for c in col_names])
    factor_flags = self.isfactor()
    level_counts = self.nlevels()
    level_lists = self.levels()
    print("H2OFrame: '{}' \nDimensions: {} obs. of {} variables".format(self.frame_id, n_rows, n_cols))
    for idx in range(n_cols):
        col_name = col_names[idx]
        # Pad every name up to the longest one so that the descriptions line up.
        padding = ' ' * (width - max(0, len(col_name)))
        print("$ {} {}: ".format(col_name, padding), end=' ')
        if factor_flags[idx]:
            quoted_levels = '"' + '","'.join(level_lists[idx]) + '"'
            print("Factor w/ {} level(s) {} ".format(level_counts[idx], quoted_levels), end='\n')
        else:
            # Preview the first 10 rows of the column; empty rows are shown as "nan".
            preview = h2o.as_list(self[:10, idx], False)[1:]
            print("num {}".format(" ".join(it[0] if it else "nan" for it in preview)))
def as_data_frame(self, use_pandas=True, header=True):
    """
    Obtain the dataset as a python-local object.

    :param bool use_pandas: If True (default) then return the H2OFrame as a pandas DataFrame (requires that the
        ``pandas`` library was installed). If False, then return the contents of the H2OFrame as plain nested
        list, in a row-wise order.
    :param bool header: If True (default), then column names will be included as the first row in the list.
        This flag only affects the nested-list output; the pandas branch returns before it is consulted.

    :returns: A python object (a list of lists of strings, each list is a row, if use_pandas=False, otherwise
        a pandas DataFrame) containing this H2OFrame instance's data.
    """
    if can_use_pandas() and use_pandas:
        import pandas
        csv_buffer = StringIO(self.get_frame_data())
        return pandas.read_csv(csv_buffer, low_memory=False, skip_blank_lines=False)

    from h2o.utils.csv.readers import reader
    csv_buffer = StringIO(self.get_frame_data())
    rows = []
    for row in reader(csv_buffer):
        rows.append(row)
    if not header:
        del rows[0]  # drop the row of column names
    return rows
def get_frame_data(self):
    """
    Get frame data as a string in csv format.

    This will create a multiline string, where each line will contain a separate row of frame's data, with
    individual values separated by commas.
    """
    request_params = {"frame_id": self.frame_id, "hex_string": False}
    return h2o.api("GET /3/DownloadDataset", data=request_params)
def __getitem__(self, item):
    """
    Frame slicing, supports row and column slicing.

    :param item: selector of a subframe. This can be one of the following:

        - an int, indicating selection of a single column at the specified index (0-based)
        - a string, selecting a column with the given name
        - a list of ints or strings, selecting several columns with the given indices / names
        - a slice, selecting columns with the indices within this slice
        - a single-column boolean frame, selecting rows for which the selector is true
        - a 2-element tuple, where the first element is a row selector, and the second element is the
          column selector. Here the row selector may be one of: an int, a list of ints, a slice, or
          a boolean frame. The column selector is similarly one of: an int, a list of ints, a string,
          a list of strings, or a slice. It is also possible to use the empty slice (``:``) to select
          all elements within one of the dimensions.

    :returns: A new frame comprised of some rows / columns of the source frame.

    :examples:
        >>> fr[2]              # All rows, 3rd column
        >>> fr[-2]             # All rows, 2nd column from end
        >>> fr[:, -1]          # All rows, last column
        >>> fr[0:5, :]         # First 5 rows, all columns
        >>> fr[fr[0] > 1, :]   # Only rows where first cell is greater than 1, all columns
        >>> fr[[1, 5, 6]]      # Columns 2, 6, and 7
        >>> fr[0:50, [1,2,3]]  # First 50 rows, columns 2, 3, and 4
    """
    # Select columns based on a string, a list of strings, an int or a slice.
    # Note that the python column selector handles the case of negative
    # selections, or out-of-range selections - without having to compute
    # self._ncols in the front-end - which would force eager evaluation just to
    # range check in the front-end.
    new_ncols = -1
    new_nrows = -1
    new_names = None
    new_types = None
    fr = None
    flatten = False
    if isinstance(item, slice):
        item = normalize_slice(item, self.ncols)
    if is_type(item, str, int, list, slice):
        new_ncols, new_names, new_types, item = self._compute_ncol_update(item)
        new_nrows = self.nrow
        fr = H2OFrame._expr(expr=ExprNode("cols_py", self, item))
    elif isinstance(item, (ExprNode, H2OFrame)):
        new_ncols = self.ncol
        new_names = self.names
        new_types = self.types
        new_nrows = -1  # have a "big" predicate column -- update cache later on...
        fr = H2OFrame._expr(expr=ExprNode("rows", self, item))
    elif isinstance(item, tuple):
        rows, cols = item
        allrows = allcols = False
        if isinstance(cols, slice):
            cols = normalize_slice(cols, self.ncols)
            allcols = cols == slice(0, self.ncols, 1)
        if isinstance(rows, slice):
            rows = normalize_slice(rows, self.nrows)
            allrows = rows == slice(0, self.nrows, 1)

        if allrows and allcols:
            return self  # fr[:,:] -> all rows and columns.. return self
        if allrows:
            new_ncols, new_names, new_types, cols = self._compute_ncol_update(cols)
            new_nrows = self.nrow
            fr = H2OFrame._expr(expr=ExprNode("cols_py", self, cols))  # fr[:,cols] -> really just a column slice
        if allcols:
            new_ncols = self.ncols
            new_names = self.names
            new_types = self.types
            new_nrows, rows = self._compute_nrow_update(rows)
            fr = H2OFrame._expr(expr=ExprNode("rows", self, rows))  # fr[rows,:] -> really just a row slices

        if not allrows and not allcols:
            new_ncols, new_names, new_types, cols = self._compute_ncol_update(cols)
            new_nrows, rows = self._compute_nrow_update(rows)
            fr = H2OFrame._expr(expr=ExprNode("rows", ExprNode("cols_py", self, cols), rows))
            flatten = is_type(rows, int) and is_type(cols, str, int)
    else:
        raise ValueError("Unexpected __getitem__ selector: " + str(type(item)) + " " + str(item.__class__))

    assert fr is not None
    # Pythonic: if the row & col selector turn into ints (or a single col
    # name), then extract the single element out of the Frame.  Otherwise
    # return a Frame, EVEN IF the selectors are e.g. slices-of-1-value.
    if flatten:
        return fr.flatten()

    fr._ex._cache.ncols = new_ncols
    fr._ex._cache.nrows = new_nrows
    fr._ex._cache.names = new_names
    fr._ex._cache.types = new_types
    fr._is_frame = self._is_frame
    return fr


def _compute_ncol_update(self, item):  # computes new ncol, names, and types
    """
    Derive the column metadata of the frame selected by the column selector ``item``.

    :param item: a list of column names or indices, a normalized slice, a column name or a column index.
    :returns: tuple ``(new_ncols, new_names, new_types, item)``. ``new_ncols`` is left at -1 for a slice
        selector, and ``new_types`` is None when a single string selector is not among the known types.
    :raises ValueError: if ``item`` is none of the supported selector kinds.
    """
    new_ncols = -1
    if isinstance(item, list):
        new_ncols = len(item)
        if _is_str_list(item):
            new_types = {k: self.types[k] for k in item}
            new_names = item
        else:
            new_names = [self.names[i] for i in item]
            new_types = {name: self.types[name] for name in new_names}
    elif isinstance(item, slice):
        assert slice_is_normalized(item)
        new_names = self.names[item]
        new_types = {name: self.types[name] for name in new_names}
    elif is_type(item, str, int):
        new_ncols = 1
        if is_type(item, str):
            new_names = [item]
            new_types = None if item not in self.types else {item: self.types[item]}
        else:
            new_names = [self.names[item]]
            new_types = {new_names[0]: self.types[new_names[0]]}
    else:
        raise ValueError("Unexpected type: " + str(type(item)))
    return (new_ncols, new_names, new_types, item)


def _compute_nrow_update(self, item):
    """
    Derive the number of rows selected by the row selector ``item``.

    :param item: a list of row indices, a normalized slice, an H2OFrame, or any other (single-row) selector.
    :returns: two-element list ``[new_nrows, item]``; ``new_nrows`` is -1 when the selector is an H2OFrame.
    """
    if isinstance(item, list):
        new_nrows = len(item)
    elif isinstance(item, slice):
        assert slice_is_normalized(item)
        # Number of elements in the slice: ceil((stop - start) / step).
        new_nrows = (item.stop - item.start + item.step - 1) // item.step
    elif isinstance(item, H2OFrame):
        new_nrows = -1
    else:
        new_nrows = 1
    return [new_nrows, item]


def __setitem__(self, item, value):
    """
    Replace, update or add column(s) in an H2OFrame.

    :param item: A 0-based index of a column, or a column name, or a list of column names, or a slice.
        Alternatively, this may also be a two-element tuple where the first element in the tuple is a row
        selector, and the second element is a column selector. Finally, this can also be a boolean frame
        indicating which rows/columns to modify. If ``item`` is a column name that does not exist
        in the frame, then a new column will be appended to the current frame.
    :param value: The value replacing elements at positions given by ``item``. This can be either a constant, or
        another frame.
    :raises H2OValueError: if an integer column index is outside of ``[-ncols, ncols)``.
    """
    # TODO: add far stronger type checks, so that we never run in a situation where the server has to
    #       tell us that we requested an illegal operation.
    assert_is_type(item, str, int, tuple, list, H2OFrame)
    assert_is_type(value, None, numeric, str, H2OFrame)
    col_expr = None
    row_expr = None
    colname = None  # When set, we are doing an append

    if is_type(item, str):  # String column name, could be new or old
        if item in self.names:
            col_expr = self.names.index(item)  # Update an existing column
        else:
            col_expr = self.ncols
            colname = item  # New, append
    elif is_type(item, int):
        if not(-self.ncols <= item < self.ncols):
            raise H2OValueError("Incorrect column index: %d" % item)
        col_expr = item  # Column by number
        if col_expr < 0:
            col_expr += self.ncols
    elif isinstance(item, tuple):  # Both row and col specifiers
        # Need more type checks
        row_expr = item[0]
        col_expr = item[1]
        if is_type(col_expr, str):  # Col by name
            if col_expr not in self.names:  # Append
                colname = col_expr
                col_expr = self.ncol
        elif is_type(col_expr, int):
            if not(-self.ncols <= col_expr < self.ncols):
                # Report the offending column index; formatting the whole ``item`` tuple with "%d"
                # would itself fail with a TypeError and hide this error.
                raise H2OValueError("Incorrect column index: %d" % col_expr)
            if col_expr < 0:
                col_expr += self.ncols
        elif isinstance(col_expr, slice):  # Col by slice
            if col_expr.start is None and col_expr.stop is None:
                col_expr = slice(0, self.ncol)  # Slice of all
        if isinstance(row_expr, slice):
            start = row_expr.start
            step = row_expr.step
            stop = row_expr.stop
            if start is None:
                start = 0
            if stop is None:
                stop = self.nrows
            row_expr = slice(start, stop, step)
    elif isinstance(item, H2OFrame):
        row_expr = item  # Row slicing
    elif isinstance(item, list):
        col_expr = item

    if value is None:
        value = float("nan")
    value_is_own_subframe = isinstance(value, H2OFrame) and self._is_frame_in_self(value)
    old_cache = self._ex._cache
    if colname is None:
        self._ex = ExprNode(":=", self, value, col_expr, row_expr)
        self._ex._cache.fill_from(old_cache)
        if isinstance(value, H2OFrame) and \
                value._ex._cache.types_valid() and \
                self._ex._cache.types_valid():
            self._ex._cache._types.update(value._ex._cache.types)
        else:
            self._ex._cache.types = None
    else:
        self._ex = ExprNode("append", self, value, colname)
        self._ex._cache.fill_from(old_cache)
        self._ex._cache.names = self.names + [colname]
        self._ex._cache._ncols += 1
        if self._ex._cache.types_valid() and isinstance(value, H2OFrame) and value._ex._cache.types_valid():
            self._ex._cache._types[colname] = list(viewvalues(value._ex._cache.types))[0]
        else:
            self._ex._cache.types = None
    if value_is_own_subframe:
        value._ex = None  # wipe out to keep ref counts correct


def _is_frame_in_self(self, frame):
    """Return True if this frame's expression is ``frame``'s expression or occurs among its descendants."""
    if self._ex is frame._ex:
        return True
    if frame._ex._children is None:
        return False
    return any(self._is_expr_in_self(ch) for ch in frame._ex._children)


def _is_expr_in_self(self, expr):
    """Return True if ``expr`` is an ExprNode that is this frame's expression or has it among its descendants."""
    if not isinstance(expr, ExprNode):
        return False
    if self._ex is expr:
        return True
    if expr._children is None:
        return False
    return any(self._is_expr_in_self(ch) for ch in expr._children)
def drop(self, index, axis=1):
    """
    Drop a single column or row or a set of columns or rows from a H2OFrame.

    Dropping a column or row is not in-place.
    Indices of rows and columns are zero-based.

    :param index: A list of column indices, column names, or row indices to drop; or
        a string to drop a single column by name; or an int to drop a single column by index.
        A list passed here is not modified.

    :param int axis: If 1 (default), then drop columns; if 0 then drop rows. For any other value nothing
        is dropped and None is returned.

    :returns: a new H2OFrame with the respective dropped columns or rows. The original H2OFrame remains
        unchanged.
    :raises H2OValueError: if a name is not a column of the frame, or an index is negative or not smaller
        than the number of columns (rows) of the frame.
    :raises ValueError: if a list of columns is neither all ints nor all strings, or if the rows to drop
        are not given as a list of ints.
    """
    if axis == 1:
        if not isinstance(index, list):
            # If input is a string, i.e., "C1":
            if is_type(index, str):
                # Check if index is an actual column(s) in the frame
                if index not in self.names:
                    raise H2OValueError("Column(s) selected to drop are not in original frame: %r" % index)
                index = self.names.index(index)
            # If input is an int indicating a column index, i.e., 3:
            elif is_type(index, int):
                # Indices are zero-based, so ``ncol`` itself is already out of range.
                if index >= self.ncol:
                    raise H2OValueError("Column index selected to drop is not part of the frame: %r" % index)
                if index < 0:
                    raise H2OValueError("Column index selected to drop is not positive: %r" % index)

            fr = H2OFrame._expr(expr=ExprNode("cols", self, -(index + 1)), cache=self._ex._cache)
            fr._ex._cache.ncols -= 1
            fr._ex._cache.names = self.names[:index] + self.names[index + 1:]
            fr._ex._cache.types = {name: self.types[name] for name in fr._ex._cache.names}
            return fr

        # If input is an int array indicating a column index, i.e., [3] or [1,2,3]:
        if is_type(index, [int]):
            if max(index) >= self.ncol:
                raise H2OValueError("Column index selected to drop is not part of the frame: %r" % index)
            if min(index) < 0:
                raise H2OValueError("Column index selected to drop is not positive: %r" % index)
            positions = list(index)
        # If index is a string array, i.e., ["C1", "C2"]
        elif is_type(index, [str]):
            # Check if index is an actual column(s) in the frame
            if not set(index).issubset(self.names):
                raise H2OValueError("Column(s) selected to drop are not in original frame: %r" % index)
            positions = [self.names.index(name) for name in index]
        else:
            raise ValueError("Invalid column index types. Must either be a list of all int indexes, "
                             "a string list of all column names, a single int index, or "
                             "a single string for dropping columns.")

        # The negated selectors go into a new list, leaving the caller's list untouched.
        negated = [-(pos + 1) for pos in positions]
        fr = H2OFrame._expr(expr=ExprNode("cols", self, negated), cache=self._ex._cache)
        fr._ex._cache.ncols -= len(negated)
        dropped = set(positions)
        fr._ex._cache.names = [name for pos, name in enumerate(self.names) if pos not in dropped]
        fr._ex._cache.types = {name: fr.types[name] for name in fr._ex._cache.names}
        return fr
    elif axis == 0:
        if is_type(index, [int]):
            # Indices are zero-based, so ``nrow`` itself is already out of range.
            if max(index) >= self.nrow:
                raise H2OValueError("Row index selected to drop is not part of the frame: %r" % index)
            if min(index) < 0:
                raise H2OValueError("Row index selected to drop is not positive: %r" % index)
            index = [-(x + 1) for x in index]
            fr = H2OFrame._expr(expr=ExprNode("rows", self, index), cache=self._ex._cache)
            fr._ex._cache.nrows -= len(index)
        else:
            raise ValueError("Invalid row indexes. Must be a list of int row indexes to drop from the H2OFrame.")
        return fr
def pop(self, i):
    """
    Pop a column from the H2OFrame at index i.

    :param i: The index (int) or name (str) of the column to pop.
    :returns: an H2OFrame containing the column dropped from the current frame; the current frame is modified
        in-place and loses the column.
    """
    if is_type(i, str):
        i = self.names.index(i)
    popped = H2OFrame._expr(expr=ExprNode("cols", self, i))

    # Replace this frame's expression with "everything but column i", then rebuild its cached metadata
    # from the previous cache.
    previous_cache = self._ex._cache
    self._ex = ExprNode("cols", self, -(i + 1))
    current_cache = self._ex._cache
    current_cache.ncols -= 1
    current_cache.names = previous_cache.names[:i] + previous_cache.names[i + 1:]
    current_cache.types = {name: previous_cache.types[name] for name in current_cache.names}
    current_cache._data = None

    popped_cache = popped._ex._cache
    popped_cache.ncols = 1
    popped_cache.names = [previous_cache.names[i]]
    return popped
def quantile(self, prob=None, combine_method="interpolate", weights_column=None):
    """
    Compute quantiles.

    :param List[float] prob: list of probabilities for which quantiles should be computed.
    :param str combine_method: for even samples this setting determines how to combine quantiles. This can be
        one of ``"interpolate"``, ``"average"``, ``"low"``, ``"high"``.
    :param weights_column: optional weights for each row. If not given, all rows are assumed to have equal
        importance. This parameter can be either the name of column containing the observation weights in
        this frame, or a single-column separate H2OFrame of observation weights.

    :returns: a new H2OFrame containing the quantiles and probabilities; a frame of length 0 is returned as is.
    """
    if len(self) == 0:
        return self
    if prob is None:
        prob = [0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99]

    source = self
    if weights_column is None:
        weights_column = "_"
    else:
        assert_is_type(weights_column, str, I(H2OFrame, lambda wc: wc.ncol == 1 and wc.nrow == self.nrow))
        if isinstance(weights_column, H2OFrame):
            # Attach the weights as the last column, and refer to them by that column's name.
            source = self.cbind(weights_column)
            weights_column = source.names[-1]
    return H2OFrame._expr(expr=ExprNode("quantile", source, prob, combine_method, weights_column))
def concat(self, frames, axis=1):
    """
    Append multiple H2OFrames to this frame, column-wise or row-wise.

    :param List[H2OFrame] frames: list of frames that should be appended to the current frame.
    :param int axis: if 1 then append column-wise (default); any other value appends row-wise.

    :returns: an H2OFrame of the combined datasets.
    :raises ValueError: if ``frames`` is empty.
    """
    if len(frames) == 0:
        raise ValueError("Input list of frames is empty! Nothing to concat.")

    combine = self.cbind if axis == 1 else self.rbind
    return combine(frames)
def cbind(self, data):
    """
    Append data to this frame column-wise.

    :param H2OFrame data: append columns of frame ``data`` to the current frame. You can also cbind a number,
        in which case it will get converted into a constant column.

    :returns: new H2OFrame with all frames in ``data`` appended column-wise.
    :raises H2OValueError: if one of the frames has a different number of rows than this frame.
    """
    assert_is_type(data, H2OFrame, numeric, [H2OFrame, numeric])
    frames = data if isinstance(data, list) else [data]
    new_cols = list(self.columns)
    new_types = dict(self.types)
    for frame in frames:
        if not isinstance(frame, H2OFrame):
            # A bare number becomes a column whose name is not known on the client side.
            new_cols.append(None)
            continue
        if frame.nrow != self.nrow:
            raise H2OValueError("Cannot bind a dataframe with %d rows to a data frame with %d rows: "
                                "the number of rows should match" % (frame.nrow, self.nrow))
        new_cols.extend(frame.columns)
        new_types.update(frame.types)

    fr = H2OFrame._expr(expr=ExprNode("cbind", self, *frames), cache=self._ex._cache)
    fr._ex._cache.ncols = len(new_cols)

    unique_cols = set(new_cols)
    names_are_known = len(new_cols) == len(unique_cols) and None not in unique_cols
    if names_are_known:
        fr._ex._cache.names = new_cols
        fr._ex._cache.types = new_types
    else:
        # Invalidate names and types since they contain duplicate / unknown names, and the server will choose those.
        fr._ex._cache.names = None
        fr._ex._cache.types = None
    return fr
def rbind(self, data):
    """
    Append data to this frame row-wise.

    :param data: an H2OFrame or a list of H2OFrame's to be combined with current frame row-wise.
    :returns: a new H2OFrame with the rows of all frames in ``data`` appended to the rows of this frame.
    :raises H2OValueError: if a frame differs from this frame in the number, names or types of its columns.
    """
    assert_is_type(data, H2OFrame, [H2OFrame])
    frames = data if isinstance(data, list) else [data]
    for other in frames:
        if other.ncol != self.ncol:
            raise H2OValueError("Cannot row-bind a dataframe with %d columns to a data frame with %d columns: "
                                "the columns must match" % (other.ncol, self.ncol))
        if other.columns != self.columns or other.types != self.types:
            raise H2OValueError("Column names and types must match for rbind() to work")

    rbind_expr = ExprNode("rbind", self, *frames)
    fr = H2OFrame._expr(expr=rbind_expr, cache=self._ex._cache)
    appended_rows = sum(other.nrow for other in frames)
    fr._ex._cache.nrows = self.nrow + appended_rows
    return fr
def split_frame(self, ratios=None, destination_frames=None, seed=None):
    """
    Split a frame into distinct subsets of size determined by the given ratios.

    The number of subsets is always 1 more than the number of ratios given. The split is not exact: each row
    is assigned to a subset by comparing a value from a ``runif`` column against the cumulative ratios, so
    the sizes of the resulting subsets only match the requested ratios in expectation.

    :param List[float] ratios: The fractions of rows for each split. Defaults to ``[0.75]``.
    :param List[str] destination_frames: The names of the split frames.
    :param int seed: seed for the random number generator

    :returns: A list of H2OFrames
    :raises ValueError: if ``ratios`` is empty, contains a negative value or adds up to 1.0 or more, or if
        the number of ``destination_frames`` is not one more than the number of ratios.
    """
    assert_is_type(ratios, [numeric], None)
    assert_is_type(destination_frames, [str], None)
    assert_is_type(seed, int, None)

    if ratios is None:
        ratios = [0.75]
    if not ratios:
        raise ValueError("Ratios array may not be empty")

    if destination_frames is not None:
        if len(ratios) + 1 != len(destination_frames):
            raise ValueError("The number of provided destination_frames must be one more "
                             "than the number of provided ratios")

    num_slices = len(ratios) + 1

    # Turn the ratios into cumulative upper boundaries within [0, 1).
    boundaries = []
    cumulative = 0
    for ratio in ratios:
        if ratio < 0:
            raise ValueError("Ratio must be greater than 0")
        cumulative = cumulative + ratio
        if cumulative >= 1.0:
            raise ValueError("Ratios must add up to less than 1.0")
        boundaries.append(cumulative)

    splits = []
    tmp_runif = self.runif(seed)
    tmp_runif.frame_id = "%s_splitter" % _py_tmp_key(h2o.connection().session_id)

    last_slice = num_slices - 1
    for slice_no in range(num_slices):
        if slice_no == 0:
            # lower_boundary is 0.0
            tmp_slice = self[(tmp_runif <= boundaries[0]), :]
        elif slice_no == last_slice:
            # upper_boundary is 1.0
            tmp_slice = self[(tmp_runif > boundaries[slice_no - 1]), :]
        else:
            lower_boundary = boundaries[slice_no - 1]
            upper_boundary = boundaries[slice_no]
            tmp_slice = self[((tmp_runif > lower_boundary) & (tmp_runif <= upper_boundary)), :]

        if destination_frames is not None:
            tmp_slice.frame_id = destination_frames[slice_no]
        splits.append(tmp_slice)

    for split in splits:
        split.refresh()  # Force the split now (otherwise done lazily) to immediately delete tmp_runif
    h2o.remove(tmp_runif)
    del tmp_runif
    return splits
def group_by(self, by):
    """
    Return a new ``GroupBy`` object using this frame and the desired grouping columns.

    The returned groups are sorted by the natural group-by column sort.

    :param by: The columns to group on (either a single column name, or a list of column names, or
        a list of column indices).
    """
    assert_is_type(by, str, int, [str, int])
    grouping = GroupBy(self, by)
    return grouping
def sort(self, by, ascending=[]):
    """
    Return a new Frame that is sorted by column(s); the default sorting direction is ascending.

    :param by: The column to sort by (either a single column name, or a list of column names, or
        a list of column indices)
    :param ascending: Boolean array to denote sorting direction for each sorting column. True for ascending
        sort and False for descending sort. When left empty, every column is sorted in ascending order.

    :return: a new sorted Frame
    """
    assert_is_type(by, str, int, [str, int])
    if type(by) is not list:
        by = [by]
    if type(ascending) is not list:
        ascending = [ascending]  # convert to list

    for col in by:
        if self.type(col) not in ["enum", "time", "int", "real", "string"]:
            raise H2OValueError("Sort by column: " + str(col) + " not of enum, time, int, real, or string type")

    if len(ascending) > 0:
        # the user specified sorting directions: there must be one per sorted column
        assert len(ascending) == len(by), "Sorting direction must be specified for each sorted column."
        directions = [1 if flag else -1 for flag in ascending]
    else:
        # user did not specify sort direction, assume all columns ascending
        directions = [1] * len(by)
    return H2OFrame._expr(expr=ExprNode("sort", self, by, directions))
def fillna(self, method="forward", axis=0, maxlen=1):
    """
    Return a new Frame in which NAs are filled along an axis, in a given direction, up to a maximum run length.

    :param method: ``"forward"`` or ``"backward"``
    :param axis: 0 for columnar-wise or 1 for row-wise fill
    :param maxlen: Max number of consecutive NA's to fill
    :returns: A new Frame with the NAs filled as requested.
    """
    assert_is_type(axis, 0, 1)
    assert_is_type(method, str)
    assert_is_type(maxlen, int)
    fill_expr = ExprNode("h2o.fillna", self, method, axis, maxlen)
    return H2OFrame._expr(expr=fill_expr)
def impute(self, column=-1, method="mean", combine_method="interpolate", by=None, group_by_frame=None,
           values=None):
    """
    Impute missing values into the frame, modifying it in-place.

    :param int column: Index of the column to impute, or -1 to impute the entire frame. A column name is
        also accepted and is translated to its index.
    :param str method: The method of imputation: ``"mean"``, ``"median"``, or ``"mode"``.
    :param str combine_method: When the method is ``"median"``, this setting dictates how to combine quantiles
        for even samples. One of ``"interpolate"``, ``"average"``, ``"low"``, ``"high"``.
    :param by: The list of columns to group on. A single column name is translated to its index.
    :param H2OFrame group_by_frame: Impute the values with this pre-computed grouped frame.
    :param List values: The list of impute values, one per column. None indicates to skip the column.
        For categorical columns the value must be one of the column's existing levels.

    :returns: A list of values used in the imputation or the group-by result used in imputation.
    :raises AssertionError: if ``values`` does not have one entry per column.
    :raises H2OValueError: if a value given for a categorical column is not one of its levels.
    """
    if is_type(column, str):
        column = self.names.index(column)
    if is_type(by, str):
        by = self.names.index(by)

    if values is None:
        values = "_"
    else:
        # Raised explicitly so the check is not stripped under ``python -O``.
        if len(values) != len(self.columns):
            raise AssertionError("Length of values does not match length of columns")
        # convert string values to categorical num values
        levels = None  # fetched lazily, once, and only when an enum column needs translating
        values2 = []
        for i, value in enumerate(values):
            if value is not None and self.type(i) == "enum":
                if levels is None:
                    levels = self.levels()
                try:
                    values2.append(levels[i].index(value))
                except ValueError:
                    raise H2OValueError("Impute value of: " + str(value) + " not found in existing levels of"
                                        " column: " + self.col_names[i])
            else:
                # Non-categorical values, and None ("skip this column"), are passed through unchanged.
                values2.append(value)
        values = values2

    # Remember whether a grouped frame was supplied before replacing None with the "_" placeholder; comparing
    # the argument against the placeholder afterwards (``is not "_"``) relied on string identity.
    has_group_frame = group_by_frame is not None
    if not has_group_frame:
        group_by_frame = "_"

    # This code below is needed to ensure the frame (self) exists on the server. Without it, self._ex._cache.fill()
    # fails with an assertion that ._id is None.
    # This code should be removed / reworked once we have a more consistent strategy of dealing with frames.
    self._ex._eager_frame()

    if by is not None or has_group_frame:
        res = H2OFrame._expr(
            expr=ExprNode("h2o.impute", self, column, method, combine_method, by, group_by_frame, values))._frame()
    else:
        res = ExprNode("h2o.impute", self, column, method, combine_method, by, group_by_frame,
                       values)._eager_scalar()

    # The frame was modified on the server: drop the cached metadata and reload it.
    self._ex._cache.flush()
    self._ex._cache.fill(10)
    return res
def merge(self, other, all_x=False, all_y=False, by_x=None, by_y=None, method="auto"):
    """
    Merge two datasets based on common column names.

    We do not support all_x=True and all_y=True. Only one can be True or none is True. The default merge
    method is auto and it will default to the radix method. The radix method will return the correct merge
    result regardless of duplicated rows in the right frame. In addition, the radix method can perform merge
    even if you have string columns in your frames. If there are duplicated rows in your right frame, they
    will not be included if you use the hash method. The hash method cannot perform merge if you have string
    columns in your left frame. Hence, we consider the radix method superior to the hash method and is the
    default method to use.

    :param H2OFrame other: The frame to merge to the current one. By default, must have at least one column in
        common with this frame, and all columns in common are used as the merge key. If you want to use only a
        subset of the columns in common, rename the other columns so the columns are unique in the merged result.
    :param bool all_x: If True, include all rows from the left/self frame
    :param bool all_y: If True, include all rows from the right/other frame
    :param by_x: list of columns in the current frame to use as a merge key. If omitted, all columns the two
        frames have in common are used.
    :param by_y: list of columns in the ``other`` frame to use as a merge key. Should have the same number of
        columns as in the ``by_x`` list. If omitted, all columns the two frames have in common are used.
    :param method: string representing the merge method, one of auto(default), radix or hash.

    :returns: New H2OFrame with the result of merging the current frame with the ``other`` frame.
    :raises H2OValueError: if a merge key has to be derived from the common columns and there are none.
    """
    # The common columns are needed whenever at least one side's key is not given explicitly. Computing them
    # only when both keys were missing left ``common_names`` undefined if exactly one key was passed.
    if by_x is None or by_y is None:
        common_names = list(set(self.names) & set(other.names))
        if not common_names:
            raise H2OValueError("No columns in common to merge on!")

    if by_x is None:
        by_x = [self.names.index(c) for c in common_names]
    else:
        by_x = _getValidCols(by_x, self)

    if by_y is None:
        by_y = [other.names.index(c) for c in common_names]
    else:
        by_y = _getValidCols(by_y, other)

    return H2OFrame._expr(expr=ExprNode("merge", self, other, all_x, all_y, by_x, by_y, method))
def relevel(self, y):
    """
    Reorder the levels of an H2O factor, for one single column of an H2O frame.

    The levels of a factor are reordered such that the reference level is at level 0, all remaining levels
    are moved down as needed.

    :param str y: The reference level
    :returns: New reordered factor column
    """
    reference_level = quote(y)
    relevel_expr = ExprNode("relevel", self, reference_level)
    return H2OFrame._expr(expr=relevel_expr)
def insert_missing_values(self, fraction=0.1, seed=None):
    """
    Insert missing values into the current frame, modifying it in-place.

    Randomly replaces a user-specified fraction of entries in a H2O dataset with missing
    values.

    :param float fraction: A number between 0 and 1 indicating the fraction of entries to replace with missing.
    :param int seed: The seed for the random number generator used to determine which values to make missing.

    :returns: the original H2OFrame with missing values inserted.
    """
    request = {
        'dataset': self.frame_id,  # Eager; forces eval now for following REST call
        'fraction': fraction,
    }
    if seed is not None:
        request['seed'] = seed
    response = {'job': h2o.api("POST /3/MissingInserter", data=request)}
    H2OJob(response, job_type=("Insert Missing Values")).poll()
    # The data changed on the server, so the locally cached metadata is dropped.
    self._ex._cache.flush()
    return self
def min(self):
    """Return the minimum value of all frame entries."""
    min_expr = ExprNode("min", self)
    return min_expr._eager_scalar()
def max(self):
    """Return the maximum value of all frame entries."""
    max_expr = ExprNode("max", self)
    return max_expr._eager_scalar()
def sum(self, skipna=True, axis=0, **kwargs):
    """
    Compute the frame's sum by-column (or by-row).

    :param bool skipna: If True (default), then NAs are ignored during the computation. Otherwise presence
        of NAs renders the entire result NA.
    :param int axis: Direction of sum computation. If 0 (default), then sum is computed columnwise, and the
        result is a frame with 1 row and number of columns as in the original frame. If 1, then sum is computed
        rowwise and the result is a frame with 1 column (called "sum"), and number of rows equal to the number
        of rows in the original frame. For row or column sums, the ``return_frame`` parameter must be True.
    :param bool return_frame: A boolean parameter that indicates whether to return an H2O frame
        or one single aggregated value. Defaults to the ``general.allow_breaking_changes`` config option
        (False when that is not set).
    :param bool na_rm: Deprecated alias of ``skipna``; using it emits a ``DeprecationWarning``.
    :returns: either an aggregated value with sum of values per-column (old semantic); or an H2OFrame
        containing sum of values per-column/per-row in the original frame (new semantic). The new semantic
        is triggered by either providing the ``return_frame=True`` parameter, or having the
        ``general.allow_breaking_changes`` config option turned on.
    :raises H2OValueError: if any other keyword argument is passed.
    """
    assert_is_type(skipna, bool)
    assert_is_type(axis, 0, 1)

    # Deprecated since 2016-10-14,
    if "na_rm" in kwargs:
        warnings.warn("Parameter na_rm is deprecated; use skipna instead", category=DeprecationWarning)
        legacy_flag = kwargs.pop("na_rm")
        # validated under its own name first, so a type error is reported against ``na_rm``
        assert_is_type(legacy_flag, bool)
        skipna = legacy_flag

    # Determine whether to return a frame or a list
    return_frame = get_config_value("general.allow_breaking_changes", False)
    if "return_frame" in kwargs:
        return_frame = kwargs.pop("return_frame")
        assert_is_type(return_frame, bool)

    if kwargs:
        raise H2OValueError("Unknown parameters %r" % list(kwargs))

    if not return_frame:
        scalar_op = "sumNA" if skipna else "sum"
        return ExprNode(scalar_op, self)._eager_scalar()
    return H2OFrame._expr(ExprNode("sumaxis", self, skipna, axis))
def mean(self, skipna=True, axis=0, **kwargs):
    """
    Compute the frame's means by-column (or by-row).

    :param bool skipna: If True (default), then NAs are ignored during the computation. Otherwise presence
        of NAs renders the entire result NA.
    :param int axis: Direction of mean computation. If 0 (default), then mean is computed columnwise, and the
        result is a frame with 1 row and number of columns as in the original frame. If 1, then mean is
        computed rowwise and the result is a frame with 1 column (called "mean"), and number of rows equal
        to the number of rows in the original frame.
    :param bool return_frame: Whether to return an H2OFrame instead of a row of values. Defaults to the
        ``general.allow_breaking_changes`` config option (False when that is not set).
    :param bool na_rm: Deprecated alias of ``skipna``; using it emits a ``DeprecationWarning``.
    :returns: either a list of mean values per-column (old semantic); or an H2OFrame containing mean values
        per-column/per-row from the original frame (new semantic). The new semantic is triggered by either
        providing the ``return_frame=True`` parameter, or having the ``general.allow_breaking_changes``
        config option turned on.
    :raises H2OValueError: if any other keyword argument is passed.
    """
    assert_is_type(skipna, bool)
    assert_is_type(axis, 0, 1)

    # Deprecated since 2016-10-14,
    if "na_rm" in kwargs:
        warnings.warn("Parameter na_rm is deprecated; use skipna instead", category=DeprecationWarning)
        legacy_flag = kwargs.pop("na_rm")
        # validated under its own name first, so a type error is reported against ``na_rm``
        assert_is_type(legacy_flag, bool)
        skipna = legacy_flag

    # Determine whether to return a frame or a list
    return_frame = get_config_value("general.allow_breaking_changes", False)
    if "return_frame" in kwargs:
        return_frame = kwargs.pop("return_frame")
        assert_is_type(return_frame, bool)

    if kwargs:
        raise H2OValueError("Unknown parameters %r" % list(kwargs))

    means = H2OFrame._expr(ExprNode("mean", self, skipna, axis))
    if not return_frame:
        return means.getrow()
    return means
def skewness(self, na_rm=False):
    """
    Compute the skewness of each column in the frame.

    :param bool na_rm: If True, then ignore NAs during the computation.
    :returns: A list containing the skewness for each column (NaN for non-numeric columns).
    """
    skew_expr = ExprNode("skewness", self, na_rm)
    return skew_expr._eager_scalar()
def kurtosis(self, na_rm=False):
    """
    Compute the kurtosis of each column in the frame.

    We calculate the common kurtosis, such that kurtosis(normal distribution) is 3.

    :param bool na_rm: If True, then ignore NAs during the computation.
    :returns: A list containing the kurtosis for each column (NaN for non-numeric columns).
    """
    kurt_expr = ExprNode("kurtosis", self, na_rm)
    return kurt_expr._eager_scalar()
def nacnt(self):
    """
    Count the NAs in each column of this H2OFrame.

    :returns: A list of the na counts (one entry per column).
    """
    count_expr = ExprNode("naCnt", self)
    return count_expr._eager_scalar()
def median(self, na_rm=False):
    """
    Compute the median of each column in the frame.

    :param bool na_rm: If True, then ignore NAs during the computation.
    :returns: A list containing the median for each column (NaN for non-numeric columns).
    """
    median_expr = ExprNode("median", self, na_rm)
    return median_expr._eager_scalar()
def var(self, y=None, na_rm=False, use=None):
    """
    Compute the variance-covariance matrix of one or two H2OFrames.

    :param H2OFrame y: If this parameter is given, then a covariance matrix between the columns of the target
        frame and the columns of ``y`` is computed. If this parameter is not provided then the covariance
        matrix of the target frame is returned. If target frame has just a single column, then return the
        scalar variance instead of the matrix. Single rows are treated as single columns.
    :param str use: A string indicating how to handle missing values. This could be one of the following:

        - ``"everything"``: outputs NaNs whenever one of its contributing observations is missing
        - ``"all.obs"``: presence of missing observations will throw an error
        - ``"complete.obs"``: discards missing values along with all observations in their rows so that only
          complete observations are used
    :param bool na_rm: an alternative to ``use``: when this is True then the default value for ``use`` is
        ``"complete.obs"``; and if False then the default ``use`` is ``"everything"``. This parameter has no
        effect if ``use`` is given explicitly.

    :returns: An H2OFrame of the covariance matrix of the columns of this frame (if ``y`` is not given),
        or with the columns of ``y`` (if ``y`` is given). However when this frame and ``y`` are both single
        rows or single columns, then the variance is returned as a scalar.
    """
    # Without a second frame the covariance is taken of this frame with itself.
    symmetric = y is None
    if symmetric:
        y = self
    if use is None:
        use = "complete.obs" if na_rm else "everything"
    wants_scalar = self.nrow == 1 or (self.ncol == 1 and y.ncol == 1)
    cov_expr = ExprNode("var", self, y, use, symmetric)
    if wants_scalar:
        return cov_expr._eager_scalar()
    return H2OFrame._expr(expr=cov_expr)._frame()
def sd(self, na_rm=False):
    """
    Compute the standard deviation for each column in the frame.

    :param bool na_rm: if True, then NAs will be removed from the computation.
    :returns: A list containing the standard deviation for each column (NaN for non-numeric columns).
    """
    sd_expr = ExprNode("sd", self, na_rm)
    return sd_expr._eager_scalar()
def cor(self, y=None, na_rm=False, use=None):
    """
    Compute the correlation matrix of one or two H2OFrames.

    :param H2OFrame y: If this parameter is provided, then compute correlation between the columns of ``y``
        and the columns of the current frame. If this parameter is not given, then just compute the
        correlation matrix for the columns of the current frame.
    :param str use: A string indicating how to handle missing values. This could be one of the following:

        - ``"everything"``: outputs NaNs whenever one of its contributing observations is missing
        - ``"all.obs"``: presence of missing observations will throw an error
        - ``"complete.obs"``: discards missing values along with all observations in their rows so that only
          complete observations are used
    :param bool na_rm: an alternative to ``use``: when this is True then the default value for ``use`` is
        ``"complete.obs"``; and if False then the default ``use`` is ``"everything"``. This parameter has no
        effect if ``use`` is given explicitly.

    :returns: An H2OFrame of the correlation matrix of the columns of this frame (if ``y`` is not given),
        or with the columns of ``y`` (if ``y`` is given). However when this frame and ``y`` are both single
        rows or single columns, then the correlation is returned as a scalar.
    """
    assert_is_type(y, H2OFrame, None)
    assert_is_type(na_rm, bool)
    assert_is_type(use, None, "everything", "all.obs", "complete.obs")
    # Without a second frame the correlation is taken of this frame with itself.
    other = self if y is None else y
    if use is None:
        use = "complete.obs" if na_rm else "everything"
    wants_scalar = self.nrow == 1 or (self.ncol == 1 and other.ncol == 1)
    cor_expr = ExprNode("cor", self, other, use)
    if wants_scalar:
        return cor_expr._eager_scalar()
    return H2OFrame._expr(expr=cor_expr)._frame()
def distance(self, y, measure=None):
    """
    Compute a pairwise distance measure between all rows of two numeric H2OFrames.

    :param H2OFrame y: Frame containing queries (small)
    :param str measure: A string indicating what distance measure to use; ``"l2"`` is used when this is
        not given. Must be one of:

        - ``"l1"``:        Absolute distance (L1-norm, >=0)
        - ``"l2"``:        Euclidean distance (L2-norm, >=0)
        - ``"cosine"``:    Cosine similarity (-1...1)
        - ``"cosine_sq"``: Squared Cosine similarity (0...1)

    :examples:
      >>>
      >>> iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
      >>> references = iris_h2o[10:150,0:4]
      >>> queries    = iris_h2o[0:10,0:4]
      >>> A = references.distance(queries, "l1")
      >>> B = references.distance(queries, "l2")
      >>> C = references.distance(queries, "cosine")
      >>> D = references.distance(queries, "cosine_sq")
      >>> E = queries.distance(references, "l1")
      >>> (E.transpose() == A).all()

    :returns: An H2OFrame of the matrix containing pairwise distance / similarity between the
        rows of this frame (N x p) and ``y`` (M x p), with dimensions (N x M).
    """
    assert_is_type(y, H2OFrame)
    chosen_measure = "l2" if measure is None else measure
    distance_expr = ExprNode("distance", self, y, chosen_measure)
    return H2OFrame._expr(expr=distance_expr)._frame()
def strdistance(self, y, measure=None, compare_empty=True):
    """
    Compute element-wise string distances between two H2OFrames.

    Both frames need to have the same shape and only contain string/factor columns.

    :param H2OFrame y: A comparison frame.
    :param str measure: A string identifier indicating what string distance measure to use. Must be one of:

        - ``"lv"``:        Levenshtein distance
        - ``"lcs"``:       Longest common substring distance
        - ``"qgram"``:     q-gram distance
        - ``"jaccard"``:   Jaccard distance between q-gram profiles
        - ``"jw"``:        Jaro, or Jaro-Winker distance
        - ``"soundex"``:   Distance based on soundex encoding

        NOTE(review): the default ``None`` is not one of the listed identifiers; confirm whether the type
        check accepts it.
    :param compare_empty: if set to FALSE, empty strings will be handled as NaNs

    :returns: An H2OFrame of the matrix containing element-wise distance between the
        strings of this frame and ``y``. The returned frame has the same shape as the input frames.

    :examples:
      >>>
      >>> x = h2o.H2OFrame.from_python(['Martha', 'Dwayne', 'Dixon'], column_types=['factor'])
      >>> y = h2o.H2OFrame.from_python(['Marhta', 'Duane', 'Dicksonx'], column_types=['string'])
      >>> x.strdistance(y, measure="jw")
    """
    assert_is_type(y, H2OFrame)
    assert_is_type(measure, Enum('lv', 'lcs', 'qgram', 'jaccard', 'jw', 'soundex'))
    assert_is_type(compare_empty, bool)
    distance_expr = ExprNode("strDistance", self, y, measure, compare_empty)
    return H2OFrame._expr(expr=distance_expr)._frame()
def asfactor(self):
    """
    Convert columns in the current frame to categoricals.

    Every column must be of type bool, int, string or enum.

    :returns: new H2OFrame with columns of the "enum" type.
    :raises H2OValueError: if a column has any other type.
    :raises H2OTypeError: if the resulting frame's cache holds no valid column types.
    """
    for colname in self.names:
        t = self.types[colname]
        if t in {"bool", "int", "string", "enum"}:
            continue
        raise H2OValueError("Only 'int' or 'string' are allowed for "
                            "asfactor(), got %s:%s " % (colname, t))
    fr = H2OFrame._expr(expr=ExprNode("as.factor", self), cache=self._ex._cache)
    if not fr._ex._cache.types_valid():
        raise H2OTypeError("Types are not available in result")
    # Every column of the result is categorical, so the cached types are rewritten accordingly.
    fr._ex._cache.types = dict.fromkeys(self.types, "enum")
    return fr
def isfactor(self):
    """
    Test which columns in the current frame are categorical.

    :returns: a list of True/False indicating for each column in the frame whether it is categorical.
    """
    raw_flags = ExprNode("is.factor", self)._eager_scalar()
    flags = []
    for flag in raw_flags:
        flags.append(bool(flag))
    return flags
def anyfactor(self):
    """Return True if there are any categorical columns in the frame."""
    answer = ExprNode("any.factor", self)._eager_scalar()
    return bool(answer)
def categories(self):
    """
    Return the list of levels for an enum (categorical) column.

    This function can only be applied to single-column categorical frame.

    :returns: the levels of the frame's only column.
    :raises H2OValueError: if the frame does not have exactly one column, or that column is not of
        the "enum" type.
    """
    if self.ncols != 1:
        raise H2OValueError("This operation only applies to a single factor column")
    only_column = self.names[0]
    column_type = self.types[only_column]
    if column_type != "enum":
        raise H2OValueError("Input is not a factor. This operation only applies to a single factor column")
    all_levels = self.levels()
    return all_levels[0]
def transpose(self):
    """
    Transpose rows and columns of this frame.

    :returns: new H2OFrame with the rows/columns of the original frame transposed.
    """
    transpose_expr = ExprNode("t", self)
    return H2OFrame._expr(expr=transpose_expr)
def strsplit(self, pattern):
    """
    Split the strings in the target column on the given regular expression pattern.

    :param str pattern: The split pattern.
    :returns: H2OFrame containing columns of the split strings.
    """
    split_expr = ExprNode("strsplit", self, pattern)
    result = H2OFrame._expr(expr=split_expr)
    # The result is recorded as having as many rows as this frame.
    result._ex._cache.nrows = self.nrow
    return result
def tokenize(self, split):
    """
    Tokenize String

    tokenize() is similar to strsplit(), the difference between them is that tokenize() will store the
    tokenized text into a single column making it easier for additional processing (filtering stop words,
    word2vec algo, ...).

    :param str split: The regular expression to split on.
    :returns: An H2OFrame with a single column representing the tokenized Strings. Original rows of the
        input DF are separated by NA.
    """
    tokenize_expr = ExprNode("tokenize", self, split)
    return H2OFrame._expr(expr=tokenize_expr)
def countmatches(self, pattern):
    """
    For each string in the frame, count the occurrences of the provided pattern.

    If countmatches is applied to a frame, all columns of the frame must be type string, otherwise, the
    returned frame will contain errors.

    The pattern here is a plain string, not a regular expression. We will search for the occurrences of the
    pattern as a substring in element of the frame. This function is applicable to frames containing only
    string or categorical columns.

    :param str pattern: The pattern to count matches on in each string. This can also be a list of strings,
        in which case all of them will be searched for.
    :returns: numeric H2OFrame with the same shape as the original, containing counts of matches of the
        pattern for each cell in the original frame.
    """
    assert_is_type(pattern, str, [str])
    result = H2OFrame._expr(expr=ExprNode("countmatches", self, pattern))
    # The result is recorded as having this frame's shape.
    result_cache = result._ex._cache
    result_cache.nrows = self.nrow
    result_cache.ncols = self.ncol
    return result
def trim(self):
    """
    Trim white space on the left and right of strings in a single-column H2OFrame.

    :returns: H2OFrame with trimmed strings.
    """
    fr = H2OFrame._expr(expr=ExprNode("trim", self))
    # Record the result as having this frame's shape. The cache's column-count field is ``ncols``
    # (``ncol`` was a typo that left it unset).
    fr._ex._cache.nrows = self.nrow
    fr._ex._cache.ncols = self.ncol
    return fr
def substring(self, start_index, end_index=None):
    """
    For each string, return a new string that is a substring of the original string.

    If end_index is not specified, then the substring extends to the end of the original string. If the
    start_index is longer than the length of the string, or is greater than or equal to the end_index, an
    empty string is returned. Negative start_index is coerced to 0.

    :param int start_index: The index of the original string at which to start the substring, inclusive.
    :param int end_index: The index of the original string at which to end the substring, exclusive.
    :returns: An H2OFrame containing the specified substrings.
    """
    fr = H2OFrame._expr(expr=ExprNode("substring", self, start_index, end_index))
    # Record the result as having this frame's shape. The cache's column-count field is ``ncols``
    # (``ncol`` was a typo that left it unset).
    fr._ex._cache.nrows = self.nrow
    fr._ex._cache.ncols = self.ncol
    return fr
def lstrip(self, set=" "):
    """
    Return a copy of the column with leading characters removed.

    The set argument is a string specifying the set of characters to be removed.
    If omitted, the set argument defaults to removing whitespace.

    :param character set: The set of characters to lstrip from strings in column. ``None`` is treated
        like the default.
    :returns: a new H2OFrame with the same shape as the original frame and having all its values
        trimmed from the left (equivalent of Python's ``str.lstrip()``).
    """
    # work w/ None; parity with python lstrip
    if set is None:
        set = " "

    fr = H2OFrame._expr(expr=ExprNode("lstrip", self, set))
    # Record the result as having this frame's shape. The cache's column-count field is ``ncols``
    # (``ncol`` was a typo that left it unset).
    fr._ex._cache.nrows = self.nrow
    fr._ex._cache.ncols = self.ncol
    return fr
def rstrip(self, set=" "):
    """
    Return a copy of the column with trailing characters removed.

    The set argument is a string specifying the set of characters to be removed.
    If omitted, the set argument defaults to removing whitespace.

    :param character set: The set of characters to rstrip from strings in column. ``None`` is treated
        like the default.
    :returns: a new H2OFrame with the same shape as the original frame and having all its values
        trimmed from the right (equivalent of Python's ``str.rstrip()``).
    """
    # work w/ None; parity with python rstrip
    if set is None:
        set = " "

    fr = H2OFrame._expr(expr=ExprNode("rstrip", self, set))
    # Record the result as having this frame's shape. The cache's column-count field is ``ncols``
    # (``ncol`` was a typo that left it unset).
    fr._ex._cache.nrows = self.nrow
    fr._ex._cache.ncols = self.ncol
    return fr
def entropy(self):
    """
    For each string compute its Shannon entropy, if the string is empty the entropy is 0.

    :returns: an H2OFrame of Shannon entropies.
    """
    fr = H2OFrame._expr(expr=ExprNode("entropy", self))
    # Record the result as having this frame's shape. The cache's column-count field is ``ncols``
    # (``ncol`` was a typo that left it unset).
    fr._ex._cache.nrows = self.nrow
    fr._ex._cache.ncols = self.ncol
    return fr
def num_valid_substrings(self, path_to_words):
    """
    For each string, find the count of all possible substrings with 2 characters or more that are contained in
    the line-separated text file whose path is given.

    :param str path_to_words: Path to file that contains a line-separated list of strings considered valid.
    :returns: An H2OFrame with the number of substrings that are contained in the given word list.
    """
    assert_is_type(path_to_words, str)
    fr = H2OFrame._expr(expr=ExprNode("num_valid_substrings", self, path_to_words))
    # Record the result as having this frame's shape. The cache's column-count field is ``ncols``
    # (``ncol`` was a typo that left it unset).
    fr._ex._cache.nrows = self.nrow
    fr._ex._cache.ncols = self.ncol
    return fr
def nchar(self):
    """
    Count the length of each string in a single-column H2OFrame of string type.

    :returns: A single-column H2OFrame containing the per-row character count.
    """
    length_expr = ExprNode("strlen", self)
    return H2OFrame._expr(expr=length_expr)
def table(self, data2=None, dense=True):
    """
    Compute the counts of values appearing in a column, or co-occurence counts between two columns.

    :param H2OFrame data2: An optional single column to aggregate counts by.
    :param bool dense: If True (default) then use dense representation, which lists only non-zero counts,
        1 combination per row. Set to False to expand counts across all combinations.

    :returns: H2OFrame of the counts at each combination of factor levels
    """
    # ``data2`` is only passed on to the expression when it was actually supplied.
    if data2 is None:
        table_expr = ExprNode("table", self, dense)
    else:
        table_expr = ExprNode("table", self, data2, dense)
    return H2OFrame._expr(expr=table_expr)
def hist(self, breaks="sturges", plot=True, **kwargs):
    """
    Compute a histogram over a numeric column.

    :param breaks: Can be one of ``"sturges"``, ``"rice"``, ``"sqrt"``, ``"doane"``, ``"fd"``, ``"scott"``;
        or a single number for the number of breaks; or a list containing the split points, e.g:
        ``[-50, 213.2123, 9324834]``. If breaks is "fd", the MAD is used over the IQR in computing bin width.
    :param bool plot: If True (default), then a plot will be generated using ``matplotlib``.
    :param bool server: (keyword-only, default False) if True, the non-interactive ``"Agg"`` matplotlib
        backend is selected and the plot is not shown with ``plt.show()``.

    :returns: If ``plot`` is False, return H2OFrame with these columns: breaks, counts, mids_true,
        mids, and density; otherwise this method draws a plot and returns nothing.
    :raises H2OValueError: if any keyword argument other than ``server`` is passed.
    """
    server = kwargs.pop("server") if "server" in kwargs else False
    assert_is_type(breaks, int, [numeric], Enum("sturges", "rice", "sqrt", "doane", "fd", "scott"))
    assert_is_type(plot, bool)
    assert_is_type(server, bool)
    if kwargs:
        raise H2OValueError("Unknown parameters to hist(): %r" % kwargs)
    hist = H2OFrame._expr(expr=ExprNode("hist", self, breaks))._frame()

    if plot:
        try:
            import matplotlib
            if server:
                # No ``warn=False`` here: newer matplotlib releases removed that keyword and reject it
                # with a TypeError.
                matplotlib.use("Agg")
            import matplotlib.pyplot as plt
        except ImportError:
            print("ERROR: matplotlib is required to make the histogram plot. "
                  "Set `plot` to False, if a plot is not desired.")
            return

        hist["widths"] = hist["breaks"].difflag1()
        # [2:] because we're removing the title and the first row (which consists of NaNs)
        lefts = [float(c[0]) for c in h2o.as_list(hist["breaks"], use_pandas=False)[2:]]
        widths = [float(c[0]) for c in h2o.as_list(hist["widths"], use_pandas=False)[2:]]
        counts = [float(c[0]) for c in h2o.as_list(hist["counts"], use_pandas=False)[2:]]

        plt.xlabel(self.names[0])
        plt.ylabel("Frequency")
        plt.title("Histogram of %s" % self.names[0])

        # matplotlib deprecated "left" arg in 2.1.0 and removed in 3.0.0
        version_parts = matplotlib.__version__.split('.')
        mpl_version = (int(version_parts[0]), int(version_parts[1]))
        if mpl_version >= (2, 1):
            plt.bar(x=lefts, width=widths, height=counts, bottom=0)
        else:
            plt.bar(left=lefts, height=counts, width=widths, bottom=0)

        if not server:
            plt.show()
    else:
        hist["density"] = hist["counts"] / (hist["breaks"].difflag1() * hist["counts"].sum())
        return hist
def isax(self, num_words, max_cardinality, optimize_card=False, **kwargs):
    """
    Compute the iSAX index for DataFrame which is assumed to be numeric time series data.

    References:

        - http://www.cs.ucr.edu/~eamonn/SAX.pdf
        - http://www.cs.ucr.edu/~eamonn/iSAX_2.0.pdf

    :param int num_words: Number of iSAX words for the timeseries, i.e. granularity along the time series
    :param int max_cardinality: Maximum cardinality of the iSAX word. Each word can have less than the max
    :param bool optimize_card: An optimization flag that will find the max cardinality regardless of what is
        passed in for ``max_cardinality``.
    :param kwargs: accepted but not used by this method.

    :returns: An H2OFrame with the name of time series, string representation of iSAX word, followed by
        binary representation.
    :raises H2OValueError: if ``num_words`` or ``max_cardinality`` is not greater than 0.
    """
    if num_words <= 0:
        raise H2OValueError("num_words must be greater than 0")
    if max_cardinality <= 0:
        raise H2OValueError("max_cardinality must be greater than 0")
    isax_expr = ExprNode("isax", self, num_words, max_cardinality, optimize_card)
    return H2OFrame._expr(expr=isax_expr)
def convert_H2OFrame_2_DMatrix(self, predictors, yresp, h2oXGBoostModel):
    """
    Convert this H2OFrame to a DMatrix that can be used by native XGBoost.

    This method requires that you import the following toolboxes: xgboost, pandas, numpy and scipy.sparse.

    The H2OFrame contains numerical and enum columns alone. Note that H2O one-hot-encoding introduces a
    missing(NA) column. There can be NAs in any columns.

    Follow the steps below to compare H2OXGBoost and native XGBoost:

    1. Train the H2OXGBoost model with H2OFrame trainFile and generate a prediction:
       h2oModelD = H2OXGBoostEstimator(**h2oParamsD) # parameters specified as a dict()
       h2oModelD.train(x=myX, y=y, training_frame=trainFile) # train with H2OFrame trainFile
       h2oPredict = h2oPredictD = h2oModelD.predict(trainFile)

    2. Derive the DMatrix from H2OFrame:
       nativeDMatrix = trainFile.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD)

    3. Derive the parameters for native XGBoost:
       nativeParams = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams()

    4. Train your native XGBoost model and generate a prediction:
       nativeModel = xgb.train(params=nativeParams[0], dtrain=nativeDMatrix, num_boost_round=nativeParams[1])
       nativePredict = nativeModel.predict(data=nativeDMatrix, ntree_limit=nativeParams[1])

    5. Compare the predictions h2oPredict from H2OXGBoost, nativePredict from native XGBoost.

    :param predictors: List of predictor columns, can be column names or indices (indices refer to the
        columns of this frame)
    :param yresp: response column, can be column index or name
    :param h2oXGBoostModel: H2OXGboost model that are built with the same H2OFrame as input earlier
    :return: DMatrix that can be an input to a native XGBoost model
    :raises AssertionError: if ``predictors`` is not a list/tuple or the model is not an XGBoost model.
    """
    import xgboost as xgb
    import pandas as pd
    import numpy as np
    from scipy.sparse import csr_matrix

    # Raised explicitly so the checks are not stripped under ``python -O``.
    if not isinstance(predictors, (list, tuple)):
        raise AssertionError()
    if h2oXGBoostModel._model_json['algo'] != 'xgboost':
        raise AssertionError("convert_H2OFrame_2_DMatrix is used for H2OXGBoost model only.")

    tempFrame = self[predictors].cbind(self[yresp])
    colnames = tempFrame.names

    # Convert integer indices to column names. The indices address columns of *this* frame, so they must be
    # resolved against ``self.names``; the sub-frame built above has a different column order/length.
    ownNames = self.names
    predictors = [ownNames[p] if isinstance(p, int) else p for p in predictors]
    if isinstance(yresp, int):
        yresp = ownNames[yresp]  # column name of response column

    enumCols = []  # extract enum columns out to process them
    enumColsIndices = []  # store enum column indices
    typeDict = self.types
    for predName in predictors:
        if str(typeDict[predName]) == 'enum':
            enumCols.append(predName)
            enumColsIndices.append(colnames.index(predName))

    nrows = tempFrame.nrow

    # convert H2OFrame to DMatrix starts here
    if len(enumCols) > 0:  # enumCols contain all enum column names
        allDomain = tempFrame.levels()  # list all domain levels with column indices
        domainLen = []
        for enumIndex in enumColsIndices:
            # NOTE(review): enum columns without levels are skipped here, which would shift the positions
            # returned by argsort relative to ``enumCols`` -- confirm such columns cannot occur.
            if len(allDomain[enumIndex]) > 0:
                domainLen.append(len(allDomain[enumIndex]) * -1)
        incLevel = np.argsort(domainLen)  # indices of enum column indices with decreasing domain length

        # need to move enum columns to the front, highest level first
        c2 = tempFrame[enumCols[incLevel[0]]]
        tempFrame = tempFrame.drop(enumCols[incLevel[0]])
        for index in range(1, len(incLevel)):
            c2 = c2.cbind(tempFrame[enumCols[incLevel[index]]])
            tempFrame = tempFrame.drop(enumCols[incLevel[index]])

        enumCols = c2.names
        tempFrame = c2.cbind(tempFrame)
        # translate to pandas only after the columns have been reordered
        pandaFtrain = tempFrame.as_data_frame(use_pandas=True, header=True)

        pandaTrainPart = generatePandaEnumCols(pandaFtrain, enumCols[0], nrows,
                                               tempFrame[enumCols[0]].categories())
        pandaFtrain.drop([enumCols[0]], axis=1, inplace=True)

        for colInd in range(1, len(enumCols)):
            cname = enumCols[colInd]
            ctemp = generatePandaEnumCols(pandaFtrain, cname, nrows, tempFrame[enumCols[colInd]].categories())
            pandaTrainPart = pd.concat([pandaTrainPart, ctemp], axis=1)
            pandaFtrain.drop([cname], axis=1, inplace=True)

        pandaFtrain = pd.concat([pandaTrainPart, pandaFtrain], axis=1)
    else:
        pandaFtrain = tempFrame.as_data_frame(use_pandas=True, header=True)

    c0 = tempFrame[yresp].asnumeric().as_data_frame(use_pandas=True, header=True)
    pandaFtrain.drop([yresp], axis=1, inplace=True)
    pandaF = pd.concat([c0, pandaFtrain], axis=1)
    pandaF.rename(columns={c0.columns[0]: yresp}, inplace=True)
    newX = list(pandaFtrain.columns.values)
    # ``DataFrame.as_matrix`` no longer exists in current pandas; column selection + ``.values`` is the
    # equivalent that works on old and new versions alike.
    data = pandaF[newX].values
    label = pandaF[[yresp]].values

    if h2oXGBoostModel._model_json['output']['sparse']:
        return xgb.DMatrix(data=csr_matrix(data), label=label)
    return xgb.DMatrix(data=data, label=label)
def pivot(self, index, column, value):
    """
    Pivot the frame using three of its columns: ``index``, ``column`` and ``value``.

    The ``index`` and ``column`` columns should be of type enum, int, or time. When several rows
    share the same index for a column label, the first occurrence in the frame is the one kept.

    :param index: name of the column whose values become the row labels.
    :param column: name of the column whose values become the column labels of the pivoted frame.
    :param value: name of the column holding the values for each (index, column label) pair.
    :returns: a new H2OFrame with pivoted columns.
    :raises H2OValueError: if any of the three names is not a column of this frame, or if
        ``column`` / ``index`` is not of type enum, time or int.
    """
    assert_is_type(index, str)
    assert_is_type(column, str)
    assert_is_type(value, str)

    known_names = self.names
    membership_checks = (
        (index, "Index not in H2OFrame"),
        (column, "Column not in H2OFrame"),
        (value, "Value column not in H2OFrame"),
    )
    for name, message in membership_checks:
        if name not in known_names:
            raise H2OValueError(message)

    # The column type is validated before the index type, as in the original ordering.
    type_checks = (
        (column, "'column' argument is not type enum, time or int"),
        (index, "'index' argument is not type enum, time or int"),
    )
    for name, message in type_checks:
        if self.type(name) not in ["enum", "time", "int"]:
            raise H2OValueError(message)

    pivot_node = ExprNode("pivot", self, index, column, value)
    return H2OFrame._expr(expr=pivot_node)
def rank_within_group_by(self, group_by_cols, sort_cols, ascending=[], new_col_name="New_Rank_column",
                         sort_cols_sorted=False):
    """
    Add a rank column computed within each group of rows.

    The ranking is produced as follows:

    1. The frame is sorted by the columns in ``group_by_cols`` and ``sort_cols``, using the directions
       given in ``ascending`` for the ``sort_cols``. The group-by columns are always sorted ascending.
    2. A new rank column is added to the frame. Its name can be chosen through ``new_col_name``;
       the default name is ``New_Rank_column``.
    3. Within each group, rows are assigned ranks 1, 2, ... up to the end of that group.
    4. If ``sort_cols_sorted`` is True, a final sort of the frame is performed according to ``sort_cols``
       and the directions in ``ascending``. If it is False (the default), the frame from step 3 is
       returned as is, which may provide a small speedup.

    :param group_by_cols: the columns to group on (either a single column name/index, or a list of
        column names or column indices).
    :param sort_cols: the columns to sort on (either a single column name/index, or a list of column
        names or column indices).
    :param ascending: optional list of booleans giving the sorting direction for each sorting column:
        True for ascending, False for descending. Default is ascending sort. Sort direction for enums
        will be ignored.
    :param new_col_name: optional name of the new rank column. Defaults to ``New_Rank_column``.
    :param sort_cols_sorted: optional boolean denoting whether the returned frame should be sorted
        according to ``sort_cols`` and the directions specified in ``ascending``. Default is False.
    :returns: a new frame with the rank column (sorted by the columns in ``sort_cols``) computed within
        the grouping specified by ``group_by_cols``.
    :raises H2OValueError: if a sort or group-by column is not of enum, time, int or real type.

    :examples:

    >>> # If the input frame is train:
    >>> # ID Group_by_column num        data Column_to_arrange_by num_1       fdata
    >>> # 12 1               2941.552   1    3                    -3177.9077  1
    >>> # 12 1               2941.552   1    5                    -13311.8247 1
    >>> # 12 2               -22722.174 1    3                    -3177.9077  1
    >>> # 12 2               -22722.174 1    5                    -13311.8247 1
    >>> # 13 3               -12776.884 1    5                    -18421.6171 0
    >>> # 13 3               -12776.884 1    4                    28080.1607  0
    >>> # 13 1               -6049.830  1    5                    -18421.6171 0
    >>> # 13 1               -6049.830  1    4                    28080.1607  0
    >>> # 15 3               -16995.346 1    1                    -9781.6373  0
    >>> # 16 1               -10003.593 0    3                    -61284.6900 0
    >>> # 16 3               26052.495  1    3                    -61284.6900 0
    >>> # 16 3               -22905.288 0    3                    -61284.6900 0
    >>> # 17 2               -13465.496 1    2                    12094.4851  1
    >>> # 17 2               -13465.496 1    3                    -11772.1338 1
    >>> # 17 2               -13465.496 1    3                    -415.1114   0
    >>> # 17 2               -3329.619  1    2                    12094.4851  1
    >>> # 17 2               -3329.619  1    3                    -11772.1338 1
    >>> # 17 2               -3329.619  1    3                    -415.1114   0
    >>>
    >>> # If the following commands are issued:
    >>> rankedF1 = train.rank_within_group_by(["Group_by_column"], ["Column_to_arrange_by"], [True])
    >>> rankedF1.summary()
    >>>
    >>> # The returned frame rankedF1 will look like this:
    >>> # ID Group_by_column num        fdata Column_to_arrange_by num_1       fdata.1 New_Rank_column
    >>> # 12 1               2941.552   1     3                    -3177.9077  1       1
    >>> # 13 1               -6049.830  0     4                    28080.1607  0       3
    >>> # 12 1               2941.552   1     5                    -13311.8247 1       4
    >>> # 13 1               -6049.830  0     5                    -18421.6171 0       5
    >>> # 17 2               -13465.496 0     2                    12094.4851  1       1
    >>> # 17 2               -3329.619  0     2                    12094.4851  1       2
    >>> # 12 2               -22722.174 1     3                    -3177.9077  1       3
    >>> # 17 2               -13465.496 0     3                    -11772.1338 1       4
    >>> # 17 2               -13465.496 0     3                    -415.1114   0       5
    >>> # 17 2               -3329.619  0     3                    -11772.1338 1       6
    >>> # 17 2               -3329.619  0     3                    -415.1114   0       7
    >>> # 12 2               -22722.174 1     5                    -13311.8247 1       8
    >>> # 15 3               -16995.346 1     1                    -9781.6373  0       1
    >>> # 16 3               26052.495  0     3                    -61284.6900 0       2
    >>> # 16 3               -22905.288 1     3                    -61284.6900 0       3
    >>> # 13 3               -12776.884 1     4                    28080.1607  0       4
    >>> # 13 3               -12776.884 1     5                    -18421.6171 0       5
    >>>
    >>> # If the following commands are issued:
    >>> rankedF1 = train.rank_within_group_by(["Group_by_column"], ["Column_to_arrange_by"], [True],
    ...                                       sort_cols_sorted=True)
    >>> rankedF1.summary()
    >>>
    >>> # The returned frame will be sorted according to sort_cols and hence look like this instead:
    >>> # ID Group_by_column num        fdata Column_to_arrange_by num_1       fdata.1 New_Rank_column
    >>> # 15 3               -16995.346 1     1                    -9781.6373  0       1
    >>> # 17 2               -13465.496 0     2                    12094.4851  1       1
    >>> # 17 2               -3329.619  0     2                    12094.4851  1       2
    >>> # 12 1               2941.552   1     3                    -3177.9077  1       1
    >>> # 12 2               -22722.174 1     3                    -3177.9077  1       3
    >>> # 16 1               -10003.593 0     3                    -61284.6900 0       2
    >>> # 16 3               26052.495  0     3                    -61284.6900 0       2
    >>> # 16 3               -22905.288 1     3                    -61284.6900 0       3
    >>> # 17 2               -13465.496 0     3                    -11772.1338 1       4
    >>> # 17 2               -13465.496 0     3                    -415.1114   0       5
    >>> # 17 2               -3329.619  0     3                    -11772.1338 1       6
    >>> # 17 2               -3329.619  0     3                    -415.1114   0       7
    >>> # 13 3               -12776.884 1     4                    28080.1607  0       4
    >>> # 13 1               -6049.830  0     4                    28080.1607  0       3
    >>> # 12 1               2941.552   1     5                    -13311.8247 1       4
    >>> # 12 2               -22722.174 1     5                    -13311.8247 1       8
    >>> # 13 3               -12776.884 1     5                    -18421.6171 0       5
    >>> # 13 1               -6049.830  0     5                    -18421.6171 0       5
    """
    assert_is_type(group_by_cols, str, int, [str, int])

    # Scalars are wrapped so that the rest of the method only deals with lists.
    if type(group_by_cols) is not list:
        group_by_cols = [group_by_cols]
    if type(sort_cols) is not list:
        sort_cols = [sort_cols]
    if type(ascending) is not list:
        ascending = [ascending]

    sortable_types = ["enum", "time", "int", "real"]
    for col in sort_cols:
        if self.type(col) not in sortable_types:
            raise H2OValueError("Sort by column: " + str(col) + " not of enum, time, int or real type")
    for col in group_by_cols:
        if self.type(col) not in sortable_types:
            raise H2OValueError("Group by column: " + str(col) + " not of enum, time, int or real type")

    # Directions are encoded as 1 (ascending) / -1 (descending); ascending is the default.
    directions = [1] * len(sort_cols)
    if ascending:
        assert len(ascending) == len(sort_cols), "Sorting direction must be specified for each sorted column."
        directions = [1 if ascending[pos] else -1 for pos in range(len(sort_cols))]

    final_sort_flag = 1 if sort_cols_sorted else 0
    rank_node = ExprNode("rank_within_groupby", self, group_by_cols, sort_cols, directions, new_col_name,
                         final_sort_flag)
    return H2OFrame._expr(expr=rank_node)
def topNBottomN(self, column=0, nPercent=10, grabTopN=-1):
    """
    Return the top or bottom N% of the values of one numerical column of the frame.

    :param column: a string for column name or an integer index.
    :param nPercent: a top or bottom percentage of the column values to return.
    :param grabTopN: -1 to grab bottom N percent and 1 to grab top N percent.
    :returns: a H2OFrame containing two columns. The first column contains the original row indices
        where the top/bottom values are extracted from. The second column contains the values.
    :raises H2OValueError: if the column index is out of range, the column name is not in the frame,
        or the selected column is not numeric.
    """
    assert 0 <= nPercent <= 100.0, "nPercent must be between 0.0 and 100.0"
    assert round(nPercent * 0.01 * self.nrows) > 0, "Increase nPercent. Current value will result in top 0 row."

    if isinstance(column, int):
        if not (0 <= column < self.ncols):
            raise H2OValueError("Invalid column index H2OFrame")
        position = column
    else:
        # Anything that is not an int is looked up as a column name.
        frame_names = self.names
        if column not in frame_names:
            raise H2OValueError("Column name not found H2OFrame")
        position = frame_names.index(column)

    if not self[position].isnumeric():
        raise H2OValueError("Wrong column type! Selected column must be numeric.")

    topn_node = ExprNode("topn", self, position, nPercent, grabTopN)
    return H2OFrame._expr(expr=topn_node)
def topN(self, column=0, nPercent=10):
    """
    Return the top N% of the values of one numerical column of the frame.

    Delegates to :meth:`topNBottomN` with ``grabTopN=1``.

    :param column: a string for column name or an integer index.
    :param nPercent: a top percentage of the column values to return.
    :returns: a H2OFrame containing two columns. The first column contains the original row indices
        where the top values are extracted from. The second column contains the top nPercent values.
    """
    top_rows = self.topNBottomN(column=column, nPercent=nPercent, grabTopN=1)
    return top_rows
def bottomN(self, column=0, nPercent=10):
    """
    Return the bottom N% of the values of one numerical column of the frame.

    Delegates to :meth:`topNBottomN` with ``grabTopN=-1``.

    :param column: a string for column name or an integer index.
    :param nPercent: a bottom percentage of the column values to return.
    :returns: a H2OFrame containing two columns. The first column contains the original row indices
        where the bottom values are extracted from. The second column contains the bottom nPercent
        values.
    """
    bottom_rows = self.topNBottomN(column=column, nPercent=nPercent, grabTopN=-1)
    return bottom_rows
def sub(self, pattern, replacement, ignore_case=False):
    """
    Substitute the first occurrence of ``pattern`` in a string with ``replacement``.

    :param str pattern: a regular expression.
    :param str replacement: a replacement string.
    :param bool ignore_case: if True then ``pattern`` will match case-insensitively.
    :returns: an H2OFrame with all values matching ``pattern`` replaced with ``replacement``.
    """
    replace_node = ExprNode("replacefirst", self, pattern, replacement, ignore_case)
    return H2OFrame._expr(expr=replace_node)
def gsub(self, pattern, replacement, ignore_case=False):
    """
    Globally substitute occurrences of ``pattern`` in a string with ``replacement``.

    :param str pattern: a regular expression.
    :param str replacement: a replacement string.
    :param bool ignore_case: if True then ``pattern`` will match case-insensitively.
    :returns: an H2OFrame with all occurrences of ``pattern`` in all values replaced with
        ``replacement``.
    """
    replace_node = ExprNode("replaceall", self, pattern, replacement, ignore_case)
    return H2OFrame._expr(expr=replace_node)
def interaction(self, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
    """
    Categorical Interaction Feature Creation in H2O.

    Creates a frame in H2O with n-th order interaction features between categorical columns, as
    specified by the user. This is a thin wrapper that forwards to ``h2o.interaction`` with this
    frame as the ``data`` argument.

    :param factors: list of factor columns (either indices or column names).
    :param bool pairwise: whether to create pairwise interactions between factors (otherwise create one
        higher-order interaction). Only applicable if there are 3 or more factors.
    :param int max_factors: max. number of factor levels in pair-wise interaction terms (if enforced,
        one extra catch-all factor will be made).
    :param int min_occurrence: min. occurrence threshold for factor levels in pair-wise interaction
        terms.
    :param str destination_frame: (internal) string indicating the key for the frame created.
    :returns: an H2OFrame
    """
    options = dict(
        factors=factors,
        pairwise=pairwise,
        max_factors=max_factors,
        min_occurrence=min_occurrence,
        destination_frame=destination_frame,
    )
    return h2o.interaction(data=self, **options)
def toupper(self):
    """
    Translate characters from lower to upper case for a particular column.

    :returns: new H2OFrame with all strings in the current frame converted to the uppercase.
    """
    upper_node = ExprNode("toupper", self)
    return H2OFrame._expr(expr=upper_node, cache=self._ex._cache)
def grep(self, pattern, ignore_case=False, invert=False, output_logical=False):
    """
    Search for matches to ``pattern`` within each element of a string column.

    Default behavior is to return indices of the elements matching the pattern. Parameter
    ``output_logical`` can be used to return a logical vector indicating if the element matches the
    pattern (1) or not (0).

    :param str pattern: a character string containing a regular expression.
    :param bool ignore_case: if True, then case is ignored during matching.
    :param bool invert: if True, then identify elements that do not match the pattern.
    :param bool output_logical: if True, then return logical vector of indicators instead of list of
        matching positions.
    :return: H2OFrame holding the matching positions or a logical list if ``output_logical`` is
        enabled.
    """
    grep_node = ExprNode("grep", self, pattern, ignore_case, invert, output_logical)
    return H2OFrame._expr(expr=grep_node)
def tolower(self):
    """
    Translate characters from upper to lower case for a particular column.

    :returns: new H2OFrame with all strings in the current frame converted to the lowercase.
    """
    lower_node = ExprNode("tolower", self)
    return H2OFrame._expr(expr=lower_node, cache=self._ex._cache)
def rep_len(self, length_out):
    """
    Create a new frame replicating the current frame.

    If the source frame has a single column, then the new frame will be replicating rows and its
    dimensions will be ``length_out x 1``. However if the source frame has more than 1 column, then
    the new frame will be replicating data in columnwise direction, and its dimensions will be
    ``nrows x length_out``, where ``nrows`` is the number of rows in the source frame. Also note that
    if ``length_out`` is smaller than the corresponding dimension of the source frame, then the new
    frame will actually be a truncated version of the original.

    :param int length_out: number of columns (rows) of the resulting H2OFrame.
    :returns: new H2OFrame with repeated data from the current frame.
    """
    rep_node = ExprNode("rep_len", self, length_out)
    return H2OFrame._expr(expr=rep_node)
def scale(self, center=True, scale=True):
    """
    Center and/or scale the columns of the current frame.

    :param center: if True, then demean the data. If False, no shifting is done. If ``center`` is a
        list of numbers then shift each column by the corresponding amount.
    :param scale: if True, then scale the data by each column's standard deviation. If False, no
        scaling is done. If ``scale`` is a list of numbers, then scale each column by the requested
        amount.
    :returns: an H2OFrame with scaled values from the current frame.
    """
    scale_node = ExprNode("scale", self, center, scale)
    return H2OFrame._expr(expr=scale_node, cache=self._ex._cache)
def signif(self, digits=6):
    """
    Round doubles/floats to the given number of significant digits.

    :param int digits: number of significant digits to retain.
    :returns: new H2OFrame with rounded values from the original frame.
    """
    signif_node = ExprNode("signif", self, digits)
    return H2OFrame._expr(expr=signif_node, cache=self._ex._cache)
def round(self, digits=0):
    """
    Round doubles/floats to the given number of decimal places.

    :param int digits: the number of decimal places to retain. Rounding to a negative number of
        decimal places is not supported. For rounding we use the "round half to even" mode
        (IEC 60559 standard), so that ``round(2.5) = 2`` and ``round(3.5) = 4``.
    :returns: new H2OFrame with rounded values from the original frame.
    """
    round_node = ExprNode("round", self, digits)
    return H2OFrame._expr(expr=round_node, cache=self._ex._cache)
def asnumeric(self):
    """
    Return new frame with all columns converted to numeric.

    When the result's cache reports valid types, every cached column type is reset to ``"real"``.

    :returns: new H2OFrame produced by the ``as.numeric`` expression.
    """
    converted = H2OFrame._expr(expr=ExprNode("as.numeric", self), cache=self._ex._cache)
    cache = converted._ex._cache
    if cache.types_valid():
        cache.types = dict.fromkeys(cache.types.keys(), "real")
    return converted
def ascharacter(self):
    """
    Convert all columns in the frame into strings.

    When the result's cache reports valid types, every cached column type is reset to ``"string"``.

    :returns: new H2OFrame with columns of "string" type.
    """
    converted = H2OFrame._expr(expr=ExprNode("as.character", self), cache=self._ex._cache)
    cache = converted._ex._cache
    if cache.types_valid():
        cache.types = dict.fromkeys(cache.types.keys(), "string")
    return converted
def na_omit(self):
    """
    Remove rows with NAs from the H2OFrame.

    The cached row count of the result is set to -1 after the expression is built.

    :returns: new H2OFrame with all rows from the original frame containing any NAs removed.
    """
    omit_node = ExprNode("na.omit", self)
    cleaned = H2OFrame._expr(expr=omit_node, cache=self._ex._cache)
    cleaned._ex._cache.nrows = -1
    return cleaned
def difflag1(self):
    """
    Conduct a diff-1 transform on a numeric frame column.

    :returns: an H2OFrame where each element is equal to the corresponding element in the source
        frame minus the previous-row element in the same frame.
    :raises H2OValueError: if the frame has more than one column, or the column is not of type
        real, int or bool.
    """
    if self.ncols > 1:
        raise H2OValueError("Only single-column frames supported")
    first_col_type = self.types[self.columns[0]]
    if first_col_type not in {"real", "int", "bool"}:
        raise H2OValueError("Numeric column expected")
    diff_node = ExprNode("difflag1", self)
    return H2OFrame._expr(expr=diff_node, cache=self._ex._cache)
def isna(self):
    """
    For each element in an H2OFrame, determine if it is NA or not.

    The result's cached dimensions are copied from this frame. If this frame's column names are
    cached, the result's columns are named ``isNA(<name>)`` and typed ``"int"``.

    :returns: an H2OFrame of 1s and 0s, where 1s mean the values were NAs.
    """
    flags = H2OFrame._expr(expr=ExprNode("is.na", self))
    source_cache = self._ex._cache
    target_cache = flags._ex._cache
    target_cache.nrows = source_cache.nrows
    target_cache.ncols = source_cache.ncols
    if source_cache.names:
        labels = ["isNA(%s)" % name for name in source_cache.names]
        target_cache.names = labels
        target_cache.types = {label: "int" for label in labels}
    return flags
def year(self):
    """
    Extract the "year" part from a date column.

    When the result's cache reports valid types, the cached column types are all set to ``"int"``.

    :returns: a single-column H2OFrame containing the "year" part from the source frame.
    """
    part = H2OFrame._expr(expr=ExprNode("year", self), cache=self._ex._cache)
    part_cache = part._ex._cache
    if part_cache.types_valid():
        part_cache.types = dict.fromkeys(self._ex._cache.types.keys(), "int")
    return part
def month(self):
    """
    Extract the "month" part from a date column.

    When the result's cache reports valid types, the cached column types are all set to ``"int"``.

    :returns: a single-column H2OFrame containing the "month" part from the source frame.
    """
    part = H2OFrame._expr(expr=ExprNode("month", self), cache=self._ex._cache)
    part_cache = part._ex._cache
    if part_cache.types_valid():
        part_cache.types = dict.fromkeys(self._ex._cache.types.keys(), "int")
    return part
def week(self):
    """
    Extract the "week" part from a date column.

    When the result's cache reports valid types, the cached column types are all set to ``"int"``.

    :returns: a single-column H2OFrame containing the "week" part from the source frame.
    """
    part = H2OFrame._expr(expr=ExprNode("week", self), cache=self._ex._cache)
    part_cache = part._ex._cache
    if part_cache.types_valid():
        part_cache.types = dict.fromkeys(self._ex._cache.types.keys(), "int")
    return part
def day(self):
    """
    Extract the "day" part from a date column.

    When the result's cache reports valid types, the cached column types are all set to ``"int"``.

    :returns: a single-column H2OFrame containing the "day" part from the source frame.
    """
    part = H2OFrame._expr(expr=ExprNode("day", self), cache=self._ex._cache)
    part_cache = part._ex._cache
    if part_cache.types_valid():
        part_cache.types = dict.fromkeys(self._ex._cache.types.keys(), "int")
    return part
def dayOfWeek(self):
    """
    Extract the "day-of-week" part from a date column.

    When the result's cache reports valid types, the cached column types are all set to ``"int"``.

    :returns: a single-column H2OFrame containing the "day-of-week" part from the source frame.
    """
    part = H2OFrame._expr(expr=ExprNode("dayOfWeek", self), cache=self._ex._cache)
    part_cache = part._ex._cache
    if part_cache.types_valid():
        part_cache.types = dict.fromkeys(self._ex._cache.types.keys(), "int")
    return part
def hour(self):
    """
    Extract the "hour-of-day" part from a date column.

    When the result's cache reports valid types, the cached column types are all set to ``"int"``.

    :returns: a single-column H2OFrame containing the "hour-of-day" part from the source frame.
    """
    part = H2OFrame._expr(expr=ExprNode("hour", self), cache=self._ex._cache)
    part_cache = part._ex._cache
    if part_cache.types_valid():
        part_cache.types = dict.fromkeys(self._ex._cache.types.keys(), "int")
    return part
def minute(self):
    """
    Extract the "minute" part from a date column.

    When the result's cache reports valid types, the cached column types are all set to ``"int"``.

    :returns: a single-column H2OFrame containing the "minute" part from the source frame.
    """
    part = H2OFrame._expr(expr=ExprNode("minute", self), cache=self._ex._cache)
    part_cache = part._ex._cache
    if part_cache.types_valid():
        part_cache.types = dict.fromkeys(self._ex._cache.types.keys(), "int")
    return part
def second(self):
    """
    Extract the "second" part from a date column.

    When the result's cache reports valid types, the cached column types are all set to ``"int"``.

    :returns: a single-column H2OFrame containing the "second" part from the source frame.
    """
    part = H2OFrame._expr(expr=ExprNode("second", self), cache=self._ex._cache)
    part_cache = part._ex._cache
    if part_cache.types_valid():
        part_cache.types = dict.fromkeys(self._ex._cache.types.keys(), "int")
    return part
def runif(self, seed=None):
    """
    Generate a column of random numbers drawn from a uniform distribution [0,1) and
    having the same data layout as the source frame.

    :param int seed: seed for the random number generator. When None, -1 is passed to the backend
        expression instead.
    :returns: Single-column H2OFrame filled with doubles sampled uniformly from [0,1).
    """
    if seed is None:
        seed_arg = -1
    else:
        seed_arg = seed
    sampled = H2OFrame._expr(expr=ExprNode("h2o.runif", self, seed_arg))
    # The result shape is known up front: one column, as many rows as this frame.
    sampled._ex._cache.ncols = 1
    sampled._ex._cache.nrows = self.nrow
    return sampled
def stratified_split(self, test_frac=0.2, seed=-1):
    """
    Construct a column that can be used to perform a random stratified split.

    :param float test_frac: the fraction of rows that will belong to the "test".
    :param int seed: the seed for the random number generator.
    :returns: an H2OFrame having single categorical column with two levels: ``"train"`` and
        ``"test"``.

    :examples:

    >>> stratsplit = df["y"].stratified_split(test_frac=0.3, seed=12349453)
    >>> train = df[stratsplit=="train"]
    >>> test = df[stratsplit=="test"]
    >>>
    >>> # check that the distributions among the initial frame, and the
    >>> # train/test frames match
    >>> df["y"].table()["Count"] / df["y"].table()["Count"].sum()
    >>> train["y"].table()["Count"] / train["y"].table()["Count"].sum()
    >>> test["y"].table()["Count"] / test["y"].table()["Count"].sum()
    """
    split_node = ExprNode('h2o.random_stratified_split', self, test_frac, seed)
    return H2OFrame._expr(expr=split_node)
def match(self, table, nomatch=0):
    """
    Make a vector of the positions of (first) matches of its first argument in its second.

    Only applicable to single-column categorical/string frames.

    :param List table: the list of items to match against.
    :param int nomatch: value that should be returned when there is no match.
    :returns: a new H2OFrame holding, for each cell of the source frame, the position of its (first)
        match in ``table``, or ``nomatch`` when there is none.
    """
    # The trailing None is passed through to the backend "match" expression unchanged.
    match_node = ExprNode("match", self, table, nomatch, None)
    return H2OFrame._expr(expr=match_node)
def cut(self, breaks, labels=None, include_lowest=False, right=True, dig_lab=3):
    """
    Cut a numeric vector into categorical "buckets".

    This method is only applicable to a single-column numeric frame.

    :param List[float] breaks: the cut points in the numeric vector.
    :param List[str] labels: labels for categorical levels produced. Defaults to set notation of
        intervals defined by the breaks.
    :param bool include_lowest: by default, cuts are defined as intervals ``(lo, hi]``. If this
        parameter is True, then the interval becomes ``[lo, hi]``.
    :param bool right: include the high value: ``(lo, hi]``. If False, get ``(lo, hi)``.
    :param int dig_lab: number of digits following the decimal point to consider.
    :returns: Single-column H2OFrame of categorical data.
    :raises H2OValueError: if the frame does not have exactly one column, or that column is not of
        type int or real.
    """
    assert_is_type(breaks, [numeric])
    if self.ncols != 1:
        raise H2OValueError("Single-column frame is expected")
    only_col_type = self.types[self.names[0]]
    if only_col_type not in {"int", "real"}:
        raise H2OValueError("A numeric column is expected")

    cut_node = ExprNode("cut", self, breaks, labels, include_lowest, right, dig_lab)
    buckets = H2OFrame._expr(expr=cut_node, cache=self._ex._cache)
    # The bucketed column is categorical, so the cached types are overwritten accordingly.
    buckets._ex._cache.types = dict.fromkeys(self.names, "enum")
    return buckets
def which(self):
    """
    Compose the list of row indices for which the frame contains non-zero values.

    Only applicable to integer single-column frames.
    Equivalent to comprehension ``[index for index, value in enumerate(self) if value]``.

    :returns: a new single-column H2OFrame containing indices of those rows in the original frame
        that contained non-zero values.
    """
    which_node = ExprNode("which", self)
    return H2OFrame._expr(expr=which_node)
def idxmax(self, skipna=True, axis=0):
    """
    Get the index of the max value in a column or row.

    :param bool skipna: if True (default), then NAs are ignored during the search. Otherwise presence
        of NAs renders the entire result NA.
    :param int axis: direction of finding the max index. If 0 (default), then the max index is
        searched columnwise, and the result is a frame with 1 row and number of columns as in the
        original frame. If 1, then the max index is searched rowwise and the result is a frame with
        1 column, and number of rows equal to the number of rows in the original frame.
    :returns: either a list of max index values per-column or an H2OFrame containing max index values
        per-row from the original frame.
    """
    argmax_node = ExprNode("which.max", self, skipna, axis)
    return H2OFrame._expr(expr=argmax_node)
def idxmin(self, skipna=True, axis=0):
    """
    Get the index of the min value in a column or row.

    :param bool skipna: if True (default), then NAs are ignored during the search. Otherwise presence
        of NAs renders the entire result NA.
    :param int axis: direction of finding the min index. If 0 (default), then the min index is
        searched columnwise, and the result is a frame with 1 row and number of columns as in the
        original frame. If 1, then the min index is searched rowwise and the result is a frame with
        1 column, and number of rows equal to the number of rows in the original frame.
    :returns: either a list of min index values per-column or an H2OFrame containing min index values
        per-row from the original frame.
    """
    argmin_node = ExprNode("which.min", self, skipna, axis)
    return H2OFrame._expr(expr=argmin_node)
def ifelse(self, yes, no):
    """
    Equivalent to ``[y if t else n for t,y,n in zip(self,yes,no)]``.

    This frame acts as the test vector. Based on its booleans, the output has the values of the
    ``yes`` and ``no`` vectors interleaved (or merged together). All frames must have the same row
    count. Single column frames are broadened to match wider frames. Scalars are allowed, and are
    also broadened to match wider frames.

    :param yes: frame to use if the test is true; may be a scalar or single column.
    :param no: frame to use if the test is false; may be a scalar or single column.
    :returns: an H2OFrame of the merged yes/no frames/scalars according to the test input frame.
    """
    choice_node = ExprNode("ifelse", self, yes, no)
    return H2OFrame._expr(expr=choice_node)
def apply(self, fun=None, axis=0):
    """
    Apply a lambda expression to an H2OFrame.

    :param fun: a lambda expression to be applied per row or per column.
    :param axis: 0 = apply to each column; 1 = apply to each row
    :returns: a new H2OFrame with the results of applying ``fun`` to the current frame.
    """
    from .astfun import lambda_to_expr
    assert_is_type(axis, 0, 1)
    assert_is_type(fun, FunctionType)
    assert_satisfies(fun, fun.__name__ == "<lambda>")

    translated = lambda_to_expr(fun)
    # The backend "apply" receives 2 for axis 0 and 1 for axis 1.
    margin = 1 + (axis == 0)
    apply_node = ExprNode("apply", self, margin, *translated)
    return H2OFrame._expr(expr=apply_node)
#------------------------------------------------------------------------------------------------------------------- # Synonyms + Deprecated #------------------------------------------------------------------------------------------------------------------- # Here we have all methods that are provided as alternative names to some other names defined above. This also # includes methods that we rename as part of the deprecation process (but keeping the old name for the sake of # backward compatibility). We gather them all down here to have a slightly cleaner code.
@staticmethod
def mktime(year=1970, month=0, day=0, hour=0, minute=0, second=0, msec=0):
    """
    Deprecated, use :func:`moment` instead.

    This function was left for backward-compatibility purposes only. It is
    not very stable, and counterintuitively uses 0-based months and days,
    so "January 4th, 2001" should be entered as ``mktime(2001, 0, 3)``.
    """
    time_parts = (year, month, day, hour, minute, second, msec)
    return H2OFrame._expr(ExprNode("mktime", *time_parts))
@property
def columns(self):
    """Same as ``self.names``."""
    return self.names

@columns.setter
def columns(self, value):
    # Assigning to ``columns`` renames the frame's columns via ``set_names``.
    self.set_names(value)

@property
def col_names(self):
    """Same as ``self.names``."""
    return self.names

@col_names.setter
def col_names(self, value):
    # Assigning to ``col_names`` renames the frame's columns via ``set_names``.
    self.set_names(value)

def __len__(self):
    """Number of rows in the dataframe, same as ``self.nrows``."""
    return self.nrows

@property
def nrow(self):
    """Same as ``self.nrows``."""
    return self.nrows

@property
def ncol(self):
    """Same as ``self.ncols``."""
    return self.ncols

@property
def dim(self):
    """Two-element list ``[nrow, ncol]``; same as ``list(self.shape)``."""
    return [self.nrow, self.ncol]

# NOTE(review): the ``frame_id`` alias below is disabled; kept here as found.
#@property
#def frame_id(self):
#    """Same as ``frame.id``."""
#    return self.id

#@frame_id.setter
#def frame_id(self, value):
#    self.id = value
@staticmethod
def from_python(python_obj, destination_frame=None, header=0, separator=",", column_names=None,
                column_types=None, na_strings=None):
    """
    [DEPRECATED] Use constructor ``H2OFrame()`` instead.

    All arguments are forwarded positionally, in the order given, to the ``H2OFrame`` constructor.
    """
    ctor_args = (python_obj, destination_frame, header, separator, column_names, column_types, na_strings)
    return H2OFrame(*ctor_args)
def ischaracter(self):
    """[DEPRECATED] Use ``frame.isstring()``."""
    is_string = self.isstring()
    return is_string
#-----------------------------------------------------------------------------------------------------------------------
# Helpers
#-----------------------------------------------------------------------------------------------------------------------

def _getValidCols(by_idx, fr):
    """
    Translate a mix of column names and integer indices into a list of unique column indices.

    This lets the user supply column names as well as index numbers.

    :param by_idx: iterable of column names (str) and/or column indices (int).
    :param fr: frame whose ``names`` list is used to resolve column names.
    :returns: list of distinct integer indices (built through a ``set``, so the order is not
        guaranteed to follow the input order).
    :raises H2OValueError: if a name is not in ``fr.names``, or an entry is neither str nor int.
    """
    tmp = []
    for i in by_idx:
        if type(i) == str:
            if i not in fr.names:
                raise H2OValueError("Column: " + i + " not in frame.")
            tmp.append(fr.names.index(i))
        elif type(i) != int:
            # str() is required here: ``i`` is by construction not a string in this branch, and
            # concatenating it directly would raise TypeError instead of the intended error.
            raise H2OValueError("Join on column: " + str(i) + " not of type int")
        else:
            tmp.append(i)
    return list(set(tmp))


def _binop(lhs, op, rhs, rtype=None):
    """
    Build the H2OFrame for the binary operation ``lhs <op> rhs``.

    :param lhs: left operand: a string, number, date/timestamp, or H2OFrame.
    :param op: name of the operation passed to ``ExprNode``.
    :param rhs: right operand: a string, number, date/timestamp, or H2OFrame.
    :param rtype: optional type name; when given and the result's column names are cached, every
        result column is marked with this type.
    :returns: H2OFrame wrapping the expression.
    :raises H2OValueError: if both operands are frames with incompatible shapes.
    """
    assert_is_type(lhs, str, numeric, datetime.date, pandas_timestamp, numpy_datetime, H2OFrame)
    assert_is_type(rhs, str, numeric, datetime.date, pandas_timestamp, numpy_datetime, H2OFrame)
    if isinstance(lhs, H2OFrame) and isinstance(rhs, H2OFrame) and lhs._is_frame and rhs._is_frame:
        lrows, lcols = lhs.shape
        rrows, rcols = rhs.shape
        # Shapes are compatible when they are equal, or when one side is a single column / single
        # row / single cell that lines up with the other side.
        compatible = ((lcols == rcols and lrows == rrows) or
                      (lcols == 1 and lrows == rrows) or
                      (lcols == 1 and lrows == 1) or
                      (rcols == 1 and lrows == rrows) or
                      (rcols == 1 and rrows == 1) or
                      (lrows == 1 and lcols == rcols) or
                      (rrows == 1 and lcols == rcols)
                      )
        if not compatible:
            raise H2OValueError("Attempting to operate on incompatible frames: (%d x %d) and (%d x %d)"
                                % (lrows, lcols, rrows, rcols))

    # Date-like scalars are converted to frames before the expression is built.
    if is_type(lhs, pandas_timestamp, numpy_datetime, datetime.date):
        lhs = H2OFrame.moment(date=lhs)
    if is_type(rhs, pandas_timestamp, numpy_datetime, datetime.date):
        rhs = H2OFrame.moment(date=rhs)

    # The cache of the left operand is preferred when it is a frame.
    cache = lhs._ex._cache if isinstance(lhs, H2OFrame) else rhs._ex._cache
    res = H2OFrame._expr(expr=ExprNode(op, lhs, rhs), cache=cache)
    if rtype is not None and res._ex._cache._names is not None:
        res._ex._cache._types = {name: rtype for name in res._ex._cache._names}
    return res


def generatePandaEnumCols(pandaFtrain, cname, nrows, domainL):
    """
    For an H2O Enum column, we perform one-hot-encoding here and add one more column, "missing(NA)" to it.

    :param pandaFtrain: panda frame derived from H2OFrame
    :param cname: column name of enum col
    :param nrows: number of rows of enum col
    :param domainL: list of the domain levels (strings) of the enum column
    :return: panda frame with enum col encoded correctly for native XGBoost: one ``<cname>_<level>``
        column per dummy column, followed by a ``<cname>.missing(NA)`` indicator column.
    """
    import numpy as np
    import pandas as pd

    cmissingNames = [cname + ".missing(NA)"]
    # The builtin ``int`` is used as dtype: the ``np.int`` alias no longer exists in current NumPy.
    tempnp = np.zeros((nrows, 1), dtype=int)
    # check for nan and assign it correct value
    colVals = pandaFtrain[cname]
    for ind in range(nrows):
        try:
            if not (colVals[ind] in domainL):
                tempnp[ind] = 1
        except ValueError:
            # Best-effort: a value whose membership test fails is left flagged as not missing.
            pass

    zeroFrame = pd.DataFrame(tempnp)
    zeroFrame.columns = cmissingNames
    temp = pd.get_dummies(pandaFtrain[cname], prefix=cname, drop_first=False)
    colLength = len(list(temp))  # number of dummy columns produced
    # NOTE(review): the dummy columns are selected by the first ``colLength`` domain levels, which
    # assumes those levels are the ones present in the data -- confirm against callers.
    newNames = [cname + "_" + domainL[ind] for ind in range(colLength)]
    ftemp = temp[newNames]
    ctemp = pd.concat([ftemp, zeroFrame], axis=1)
    return ctemp