Source code for h2o.estimators.word2vec

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


class H2OWord2vecEstimator(H2OEstimator):
    """
    Word2Vec

    Every model parameter is exposed as a property. The setters validate the
    supplied value and store it in ``self._parms`` under the parameter's name;
    the getters read it back from there.
    """

    algo = "word2vec"
    supervised_learning = False
    _options_ = {'requires_training_frame': False}

    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 min_word_freq=5,  # type: int
                 word_model="skip_gram",  # type: Literal["skip_gram", "cbow"]
                 norm_model="hsm",  # type: Literal["hsm"]
                 vec_size=100,  # type: int
                 window_size=5,  # type: int
                 sent_sample_rate=0.001,  # type: float
                 init_learning_rate=0.025,  # type: float
                 epochs=5,  # type: int
                 pre_trained=None,  # type: Optional[Union[None, str, H2OFrame]]
                 max_runtime_secs=0.0,  # type: float
                 export_checkpoints_dir=None,  # type: Optional[str]
                 ):
        """
        :param model_id: Destination id for this model; auto-generated if not specified.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param min_word_freq: This will discard words that appear less than <int> times
               Defaults to ``5``.
        :type min_word_freq: int
        :param word_model: The word model to use (SkipGram or CBOW)
               Defaults to ``"skip_gram"``.
        :type word_model: Literal["skip_gram", "cbow"]
        :param norm_model: Use Hierarchical Softmax
               Defaults to ``"hsm"``.
        :type norm_model: Literal["hsm"]
        :param vec_size: Set size of word vectors
               Defaults to ``100``.
        :type vec_size: int
        :param window_size: Set max skip length between words
               Defaults to ``5``.
        :type window_size: int
        :param sent_sample_rate: Set threshold for occurrence of words. Those that appear with higher frequency in
               the training data will be randomly down-sampled; useful range is (0, 1e-5)
               Defaults to ``0.001``.
        :type sent_sample_rate: float
        :param init_learning_rate: Set the starting learning rate
               Defaults to ``0.025``.
        :type init_learning_rate: float
        :param epochs: Number of training iterations to run
               Defaults to ``5``.
        :type epochs: int
        :param pre_trained: Id of a data frame that contains a pre-trained (external) word2vec model
               Defaults to ``None``.
        :type pre_trained: Union[None, str, H2OFrame], optional
        :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
               Defaults to ``0.0``.
        :type max_runtime_secs: float
        :param export_checkpoints_dir: Automatically export generated models to this directory.
               Defaults to ``None``.
        :type export_checkpoints_dir: str, optional
        """
        super(H2OWord2vecEstimator, self).__init__()
        self._parms = {}
        self._id = self._parms['model_id'] = model_id
        # Each assignment below goes through the matching property setter,
        # so every argument is validated before it is stored in ``_parms``.
        self.training_frame = training_frame
        self.min_word_freq = min_word_freq
        self.word_model = word_model
        self.norm_model = norm_model
        self.vec_size = vec_size
        self.window_size = window_size
        self.sent_sample_rate = sent_sample_rate
        self.init_learning_rate = init_learning_rate
        self.epochs = epochs
        # Must come after ``vec_size``: a non-None ``pre_trained`` frame
        # overwrites ``vec_size`` with the size derived from the frame.
        self.pre_trained = pre_trained
        self.max_runtime_secs = max_runtime_secs
        self.export_checkpoints_dir = export_checkpoints_dir

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator()
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("tutor", 3)
        >>> print(synonyms)
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def min_word_freq(self):
        """
        This will discard words that appear less than <int> times

        Type: ``int``, defaults to ``5``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(epochs=1, min_word_freq=4)
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("teacher", 3)
        >>> print(synonyms)
        """
        return self._parms.get("min_word_freq")

    @min_word_freq.setter
    def min_word_freq(self, min_word_freq):
        assert_is_type(min_word_freq, None, int)
        self._parms["min_word_freq"] = min_word_freq

    @property
    def word_model(self):
        """
        The word model to use (SkipGram or CBOW)

        Type: ``Literal["skip_gram", "cbow"]``, defaults to ``"skip_gram"``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(epochs=3, word_model="skip_gram")
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("assistant", 3)
        >>> print(synonyms)
        """
        return self._parms.get("word_model")

    @word_model.setter
    def word_model(self, word_model):
        assert_is_type(word_model, None, Enum("skip_gram", "cbow"))
        self._parms["word_model"] = word_model

    @property
    def norm_model(self):
        """
        Use Hierarchical Softmax

        Type: ``Literal["hsm"]``, defaults to ``"hsm"``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(epochs=1, norm_model="hsm")
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("teacher", 3)
        >>> print(synonyms)
        """
        return self._parms.get("norm_model")

    @norm_model.setter
    def norm_model(self, norm_model):
        assert_is_type(norm_model, None, Enum("hsm"))
        self._parms["norm_model"] = norm_model

    @property
    def vec_size(self):
        """
        Set size of word vectors

        Type: ``int``, defaults to ``100``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(epochs=3, vec_size=50)
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("tutor", 3)
        >>> print(synonyms)
        """
        return self._parms.get("vec_size")

    @vec_size.setter
    def vec_size(self, vec_size):
        assert_is_type(vec_size, None, int)
        self._parms["vec_size"] = vec_size

    @property
    def window_size(self):
        """
        Set max skip length between words

        Type: ``int``, defaults to ``5``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(epochs=3, window_size=2)
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("teacher", 3)
        >>> print(synonyms)
        """
        return self._parms.get("window_size")

    @window_size.setter
    def window_size(self, window_size):
        assert_is_type(window_size, None, int)
        self._parms["window_size"] = window_size

    @property
    def sent_sample_rate(self):
        """
        Set threshold for occurrence of words. Those that appear with higher frequency in the training data
        will be randomly down-sampled; useful range is (0, 1e-5)

        Type: ``float``, defaults to ``0.001``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(epochs=1, sent_sample_rate=0.01)
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("teacher", 3)
        >>> print(synonyms)
        """
        return self._parms.get("sent_sample_rate")

    @sent_sample_rate.setter
    def sent_sample_rate(self, sent_sample_rate):
        assert_is_type(sent_sample_rate, None, float)
        self._parms["sent_sample_rate"] = sent_sample_rate

    @property
    def init_learning_rate(self):
        """
        Set the starting learning rate

        Type: ``float``, defaults to ``0.025``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(epochs=3, init_learning_rate=0.05)
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("assistant", 3)
        >>> print(synonyms)
        """
        return self._parms.get("init_learning_rate")

    @init_learning_rate.setter
    def init_learning_rate(self, init_learning_rate):
        assert_is_type(init_learning_rate, None, float)
        self._parms["init_learning_rate"] = init_learning_rate

    @property
    def epochs(self):
        """
        Number of training iterations to run

        Type: ``int``, defaults to ``5``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("teacher", count = 5)
        >>> print(synonyms)
        >>>
        >>> w2v_model2 = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 1)
        >>> w2v_model2.train(training_frame=words)
        >>> synonyms2 = w2v_model2.find_synonyms("teacher", 3)
        >>> print(synonyms2)
        """
        return self._parms.get("epochs")

    @epochs.setter
    def epochs(self, epochs):
        assert_is_type(epochs, None, int)
        self._parms["epochs"] = epochs

    @property
    def pre_trained(self):
        """
        Id of a data frame that contains a pre-trained (external) word2vec model

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> words = h2o.create_frame(rows=1000,cols=1,
        ...                          string_fraction=1.0,
        ...                          missing_fraction=0.0)
        >>> embeddings = h2o.create_frame(rows=1000,cols=100,
        ...                               real_fraction=1.0,
        ...                               missing_fraction=0.0)
        >>> word_embeddings = words.cbind(embeddings)
        >>> w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
        >>> w2v_model.train(training_frame=word_embeddings)
        >>> model_id = w2v_model.model_id
        >>> model = h2o.get_model(model_id)
        """
        return self._parms.get("pre_trained")

    @pre_trained.setter
    def pre_trained(self, pre_trained):
        pt = self._parms["pre_trained"] = H2OFrame._validate(pre_trained, 'pre_trained')
        if pt is not None:
            # The vector size is dictated by the shape of the external model,
            # so it replaces whatever ``vec_size`` was set to before.
            self.vec_size = H2OWord2vecEstimator._determine_vec_size(pt)

    @property
    def max_runtime_secs(self):
        """
        Maximum allowed runtime in seconds for model training. Use 0 to disable.

        Type: ``float``, defaults to ``0.0``.

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(epochs=1, max_runtime_secs=10)
        >>> w2v_model.train(training_frame=words)
        >>> synonyms = w2v_model.find_synonyms("tutor", 3)
        >>> print(synonyms)
        """
        return self._parms.get("max_runtime_secs")

    @max_runtime_secs.setter
    def max_runtime_secs(self, max_runtime_secs):
        assert_is_type(max_runtime_secs, None, numeric)
        self._parms["max_runtime_secs"] = max_runtime_secs

    @property
    def export_checkpoints_dir(self):
        """
        Automatically export generated models to this directory.

        Type: ``str``.

        :examples:

        >>> import tempfile
        >>> from os import listdir
        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
        ...                               col_names = ["category", "jobtitle"],
        ...                               col_types = ["string", "string"],
        ...                               header = 1)
        >>> checkpoints_dir = tempfile.mkdtemp()
        >>> words = job_titles.tokenize(" ")
        >>> w2v_model = H2OWord2vecEstimator(epochs=1,
        ...                                  max_runtime_secs=10,
        ...                                  export_checkpoints_dir=checkpoints_dir)
        >>> w2v_model.train(training_frame=words)
        >>> len(listdir(checkpoints_dir))
        """
        return self._parms.get("export_checkpoints_dir")

    @export_checkpoints_dir.setter
    def export_checkpoints_dir(self, export_checkpoints_dir):
        assert_is_type(export_checkpoints_dir, None, str)
        self._parms["export_checkpoints_dir"] = export_checkpoints_dir

    # NOTE(review): the default for ``external`` is the ``H2OFrame`` class itself,
    # not a frame instance. It is kept to leave the public signature untouched;
    # callers are expected to always pass a frame explicitly.
    @staticmethod
    def from_external(external=H2OFrame):
        """
        Creates new H2OWord2vecEstimator based on an external model.

        The estimator is constructed with ``pre_trained=external`` and then
        ``train()`` is called on it without a training frame.

        :param external: H2OFrame with an external model
        :return: H2OWord2vecEstimator instance representing the external model

        :examples:

        >>> words = h2o.create_frame(rows=10, cols=1,
        ...                          string_fraction=1.0,
        ...                          missing_fraction=0.0)
        >>> embeddings = h2o.create_frame(rows=10, cols=100,
        ...                               real_fraction=1.0,
        ...                               missing_fraction=0.0)
        >>> word_embeddings = words.cbind(embeddings)
        >>> w2v_model = H2OWord2vecEstimator.from_external(external=word_embeddings)
        """
        w2v_model = H2OWord2vecEstimator(pre_trained=external)
        w2v_model.train()
        return w2v_model

    @staticmethod
    def _determine_vec_size(pre_trained):
        """
        Determines vec_size for a pre-trained model after basic model verification.

        :param pre_trained: frame holding the external model; its ``types``, ``columns``,
            ``dim`` and ``frame_id`` attributes are read.
        :return: ``pre_trained.dim[1] - 1``, i.e. the frame's second dimension minus the
            leading string column.
        :raises H2OValueError: if the first column is not of type ``'string'``, or if more
            than one column is of type ``'string'``.
        """
        first_column = pre_trained.types[pre_trained.columns[0]]

        if first_column != 'string':
            # The frame id is interpolated into the message here; passing it as a second
            # argument would leave a literal "%s" in the text shown to the user.
            raise H2OValueError("First column of given pre_trained model %s is required to be a String"
                                % pre_trained.frame_id)

        if list(pre_trained.types.values()).count('string') > 1:
            raise H2OValueError("There are multiple columns in given pre_trained model %s with a String type."
                                % pre_trained.frame_id)

        return pre_trained.dim[1] - 1