diff --git a/MANIFEST.in b/MANIFEST.in index e69de29bb..5576404e2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include easy_rec/python/ops/1.12/*.so* +include easy_rec/python/ops/1.15/*.so* diff --git a/easy_rec/__init__.py b/easy_rec/__init__.py index cfafba708..f3c7afd0b 100644 --- a/easy_rec/__init__.py +++ b/easy_rec/__init__.py @@ -15,6 +15,16 @@ logging.basicConfig( level=logging.INFO, format='[%(asctime)s][%(levelname)s] %(message)s') +ops_dir = os.path.join(curr_dir, 'python/ops') +if 'PAI' in tf.__version__: + ops_dir = os.path.join(ops_dir, '1.12_pai') +elif tf.__version__.startswith('1.12'): + ops_dir = os.path.join(ops_dir, '1.12') +elif tf.__version__.startswith('1.15'): + ops_dir = os.path.join(ops_dir, '1.15') +else: + ops_dir = None + from easy_rec.python.inference.predictor import Predictor # isort:skip # noqa: E402 from easy_rec.python.main import evaluate # isort:skip # noqa: E402 from easy_rec.python.main import distribute_evaluate # isort:skip # noqa: E402 @@ -32,12 +42,6 @@ _global_config = {} -ops_dir = os.path.join(curr_dir, 'python/ops') -if tf.__version__.startswith('1.12'): - ops_dir = os.path.join(ops_dir, '1.12') -elif tf.__version__.startswith('1.15'): - ops_dir = os.path.join(ops_dir, '1.15') - def help(): print(""" diff --git a/easy_rec/python/compat/optimizers.py b/easy_rec/python/compat/optimizers.py index 21fede4b8..37969cb1e 100644 --- a/easy_rec/python/compat/optimizers.py +++ b/easy_rec/python/compat/optimizers.py @@ -35,6 +35,9 @@ from tensorflow.python.training import moving_averages from tensorflow.python.training import optimizer as optimizer_ from tensorflow.python.training import training as train +from easy_rec.python.ops.incr_record import set_sparse_indices +import tensorflow as tf +import logging OPTIMIZER_CLS_NAMES = { 'Adagrad': @@ -75,7 +78,8 @@ def optimize_loss(loss, summaries=None, colocate_gradients_with_ops=False, not_apply_grad_after_first_step=False, - increment_global_step=True): + increment_global_step=True, + incr_save=False): """Given loss and parameters for optimizer, returns a training op. Various ways of passing optimizers include: @@ -146,6 +150,7 @@ def optimize_loss(loss, calls `optimize_loss` multiple times per training step (e.g. to optimize different parts of the model), use this arg to avoid incrementing `global_step` more times than necessary. + incr_save: increment dump checkpoints. Returns: Training op. @@ -300,11 +305,23 @@ def optimize_loss(loss, # Create gradient updates. 
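The `ops_dir` selected in the `__init__.py` hunk above (`1.12_pai`, `1.12`, `1.15`, or `None` for unsupported TF versions) is what the rest of this patch relies on to locate the prebuilt custom operators such as `kafka.so` and `incr_record.so`. Below is a minimal sketch of that loading pattern, assuming `easy_rec.ops_dir` has been set as shown; `load_custom_op` is a hypothetical helper for illustration, the real modules call `tf.load_op_library` directly.

```python
import logging
import os

import tensorflow as tf

import easy_rec


def load_custom_op(so_name):
  """Load a prebuilt operator library from the version-specific ops_dir.

  Returns None instead of raising when the current TF version has no
  prebuilt .so, mirroring the ops_dir = None branch above.
  """
  if easy_rec.ops_dir is None:
    logging.warning('no prebuilt ops for tf %s, %s not loaded',
                    tf.__version__, so_name)
    return None
  so_path = os.path.join(easy_rec.ops_dir, so_name)
  try:
    return tf.load_op_library(so_path)
  except Exception as ex:  # NotFoundError, undefined symbols, ...
    logging.warning('failed to load %s: %s', so_path, str(ex))
    return None


# e.g. kafka_module = load_custom_op('kafka.so')
```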
def _apply_grad(): + incr_save_ops = [] + if incr_save: + for grad, var in gradients: + if isinstance(grad, ops.IndexedSlices): + with ops.colocate_with(var): + incr_save_op = set_sparse_indices(grad.indices, var_name=var.op.name) + incr_save_ops.append(incr_save_op) + ops.add_to_collection('SPARSE_UPDATE_VARIABLES', (var, grad.indices.dtype)) + else: + ops.add_to_collection('DENSE_UPDATE_VARIABLES', var) + grad_updates = opt.apply_gradients( gradients, global_step=global_step if increment_global_step else None, name='train') - return control_flow_ops.with_dependencies([grad_updates], loss) + + return control_flow_ops.with_dependencies([grad_updates] + incr_save_ops, loss) if not_apply_grad_after_first_step: train_tensor = control_flow_ops.cond(global_step > 0, lambda: loss, diff --git a/easy_rec/python/input/datahub_input.py b/easy_rec/python/input/datahub_input.py index 8e86feab7..d5ef8b90d 100644 --- a/easy_rec/python/input/datahub_input.py +++ b/easy_rec/python/input/datahub_input.py @@ -4,7 +4,9 @@ import time import numpy as np +import json import tensorflow as tf +import traceback from easy_rec.python.input.input import Input from easy_rec.python.utils import odps_util @@ -18,14 +20,18 @@ from datahub.exceptions import DatahubException from datahub.models import RecordType from datahub.models import CursorType -except Exception: + import urllib3 + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + logging.getLogger('datahub.account').setLevel(logging.INFO) +except Exception as ex: + # logging.warning(traceback.format_exc(ex)) logging.warning( 'DataHub is not installed. You can install it by: pip install pydatahub') DataHub = None - class DataHubInput(Input): - """Common IO based interface, could run at local or on data science.""" + """DataHubInput is used for online train.""" + def __init__(self, data_config, @@ -35,27 +41,59 @@ def __init__(self, task_num=1): super(DataHubInput, self).__init__(data_config, feature_config, '', task_index, task_num) - if DataHub is None: - logging.error('please install datahub: ', - 'pip install pydatahub ;Python 3.6 recommended') + try: - self._datahub_config = datahub_config - if self._datahub_config is None: - pass - self._datahub = DataHub(self._datahub_config.akId, - self._datahub_config.akSecret, - self._datahub_config.region) self._num_epoch = 0 + self._datahub_config = datahub_config + if self._datahub_config is not None: + akId = self._datahub_config.akId + akSecret = self._datahub_config.akSecret + region = self._datahub_config.region + if not isinstance(akId, str): + akId = akId.encode('utf-8') + akSecret = akSecret.encode('utf-8') + region = region.encode('utf-8') + self._datahub = DataHub(akId, akSecret, region) + else: + self._datahub = None except Exception as ex: - logging.info('exception in init datahub:', str(ex)) + logging.info('exception in init datahub: %s' % str(ex)) pass + self._offset_dict = {} + if datahub_config: + if self._datahub_config.offset_info: + self._offset_dict = json.loads(self._datahub_config.offset_info) + shard_result = self._datahub.list_shard(self._datahub_config.project, + self._datahub_config.topic) + shards = shard_result.shards + self._shards = [shards[i] for i in range(len(shards)) if (i % task_num) == task_index] + logging.info('all shards: %s' % str(self._shards)) + offset_dict = {} + for x in self._shards: + if x.shard_id in self._offset_dict: + offset_dict[x.shard_id] = self._offset_dict[x.shard_id] + self._offset_dict = offset_dict def _parse_record(self, *fields): fields = 
list(fields) - inputs = {self._input_fields[x]: fields[x] for x in self._effective_fids} + field_dict = {self._input_fields[x]: fields[x] for x in self._effective_fids} for x in self._label_fids: - inputs[self._input_fields[x]] = fields[x] - return inputs + field_dict[self._input_fields[x]] = fields[x] + field_dict[Input.DATA_OFFSET] = fields[-1] + return field_dict + + def _preprocess(self, field_dict): + output_dict = super(DataHubInput, self)._preprocess(field_dict) + + # append offset fields + if Input.DATA_OFFSET in field_dict: + output_dict[Input.DATA_OFFSET] = field_dict[Input.DATA_OFFSET] + + # for _get_features to include DATA_OFFSET + if Input.DATA_OFFSET not in self._appended_fields: + self._appended_fields.append(Input.DATA_OFFSET) + + return output_dict def _datahub_generator(self): logging.info('start epoch[%d]' % self._num_epoch) @@ -65,62 +103,87 @@ def _datahub_generator(self): self.get_type_defaults(x, v) for x, v in zip(self._input_field_types, self._input_field_defaults) ] - batch_defaults = [ - np.array([x] * self._data_config.batch_size) for x in record_defaults + batch_data = [ + np.asarray([x] * self._data_config.batch_size, order='C', dtype=object) + if isinstance(x, str) else + np.array([x] * self._data_config.batch_size) + for x in record_defaults ] + batch_data.append(json.dumps(self._offset_dict)) + try: self._datahub.wait_shards_ready(self._datahub_config.project, self._datahub_config.topic) topic_result = self._datahub.get_topic(self._datahub_config.project, self._datahub_config.topic) if topic_result.record_type != RecordType.TUPLE: - logging.error('topic type illegal !') + logging.error('datahub topic type(%s) illegal' % str(topic_result.record_type)) record_schema = topic_result.record_schema - shard_result = self._datahub.list_shard(self._datahub_config.project, - self._datahub_config.topic) - shards = shard_result.shards - for shard in shards: - shard_id = shard._shard_id - cursor_result = self._datahub.get_cursor(self._datahub_config.project, + + batch_size = self._data_config.batch_size + + tid = 0 + while True: + shard_id = self._shards[tid].shard_id + tid += 1 + if tid >= len(self._shards): + tid = 0 + if shard_id not in self._offset_dict: + cursor_result = self._datahub.get_cursor(self._datahub_config.project, self._datahub_config.topic, shard_id, CursorType.OLDEST) - cursor = cursor_result.cursor - limit = self._data_config.batch_size - while True: - get_result = self._datahub.get_tuple_records( - self._datahub_config.project, self._datahub_config.topic, - shard_id, record_schema, cursor, limit) - batch_data_np = [x.copy() for x in batch_defaults] - for row_id, record in enumerate(get_result.records): - for col_id in range(len(record_defaults)): - if record.values[col_id] not in ['', 'Null', None]: - batch_data_np[col_id][row_id] = record.values[col_id] - yield tuple(batch_data_np) - if 0 == get_result.record_count: - time.sleep(1) - cursor = get_result.next_cursor - except DatahubException as e: - logging.error(e) + cursor = cursor_result.cursor + else: + cursor = self._offset_dict[shard_id]['cursor'] + + get_result = self._datahub.get_tuple_records( + self._datahub_config.project, self._datahub_config.topic, + shard_id, record_schema, cursor, batch_size) + count = get_result.record_count + if count == 0: + continue + time_offset = 0 + sequence_offset = 0 + for row_id, record in enumerate(get_result.records): + if record.system_time > time_offset: + time_offset = record.system_time + if record.sequence > sequence_offset: + sequence_offset = 
record.sequence + for col_id in range(len(record_defaults)): + if record.values[col_id] not in ['', 'Null', 'null', 'NULL', None]: + batch_data[col_id][row_id] = record.values[col_id] + else: + batch_data[col_id][row_id] = record_defaults[col_id] + cursor = get_result.next_cursor + self._offset_dict[shard_id] = {'sequence_offset': sequence_offset, + 'time_offset': time_offset, + 'cursor': cursor + } + batch_data[-1] = json.dumps(self._offset_dict) + yield tuple(batch_data) + except DatahubException as ex: + logging.error('DatahubException: %s' % str(ex)) def _build(self, mode, params): - # get input type - list_type = [self.get_tf_type(x) for x in self._input_field_types] - list_type = tuple(list_type) - list_shapes = [tf.TensorShape([None]) for x in range(0, len(list_type))] + # get input types + list_types = [self.get_tf_type(x) for x in self._input_field_types] + list_types.append(tf.string) + list_types = tuple(list_types) + list_shapes = [tf.TensorShape([None]) for x in range(0, len(self._input_field_types))] + list_shapes.append(tf.TensorShape([])) list_shapes = tuple(list_shapes) # read datahub dataset = tf.data.Dataset.from_generator( self._datahub_generator, - output_types=list_type, + output_types=list_types, output_shapes=list_shapes) if mode == tf.estimator.ModeKeys.TRAIN: - dataset = dataset.shuffle( - self._data_config.shuffle_buffer_size, - seed=2020, - reshuffle_each_iteration=True) - dataset = dataset.repeat(self.num_epochs) - else: - dataset = dataset.repeat(1) + if self._data_config.shuffle: + dataset = dataset.shuffle( + self._data_config.shuffle_buffer_size, + seed=2020, + reshuffle_each_iteration=True) + dataset = dataset.map( self._parse_record, num_parallel_calls=self._data_config.num_parallel_calls) diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py index c0d8653bf..9bb20e599 100644 --- a/easy_rec/python/input/input.py +++ b/easy_rec/python/input/input.py @@ -23,6 +23,8 @@ class Input(six.with_metaclass(_meta_type, object)): + DATA_OFFSET = 'DATA_OFFSET' + def __init__(self, data_config, feature_configs, diff --git a/easy_rec/python/input/kafka_dataset.py b/easy_rec/python/input/kafka_dataset.py new file mode 100644 index 000000000..5e2ba3a6c --- /dev/null +++ b/easy_rec/python/input/kafka_dataset.py @@ -0,0 +1,144 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
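The generator above keeps one entry per assigned shard in `self._offset_dict` (keys `cursor`, `sequence_offset`, `time_offset`) and re-serializes it into the last batch field on every yield, so the graph always carries the newest consume position. A standalone sketch of that bookkeeping follows; `FakeRecord` is a hypothetical stand-in for the pydatahub tuple record, since only `system_time`, `sequence` and the next cursor string are needed.

```python
import json


class FakeRecord(object):
  """Hypothetical stand-in for a DataHub tuple record."""

  def __init__(self, system_time, sequence):
    self.system_time = system_time
    self.sequence = sequence


def update_offset(offset_dict, shard_id, records, next_cursor):
  """Track the max system_time / sequence seen on a shard plus its cursor."""
  time_offset = 0
  sequence_offset = 0
  for record in records:
    time_offset = max(time_offset, record.system_time)
    sequence_offset = max(sequence_offset, record.sequence)
  offset_dict[shard_id] = {
      'sequence_offset': sequence_offset,
      'time_offset': time_offset,
      'cursor': next_cursor,
  }
  # the generator yields this JSON string as the DATA_OFFSET field
  return json.dumps(offset_dict)


offsets = {}
print(update_offset(offsets, '0', [FakeRecord(1650000000, 3)], 'cursor-xyz'))
```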
+# ============================================================================== +"""Kafka Dataset.""" + +import logging + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +import traceback + +try: + from easy_rec.python.ops import gen_kafka_ops +except ImportError as ex: + logging.warning('failed to import gen_kafka_ops: %s' % traceback.format_exc(ex)) + + +class KafkaDataset(dataset_ops.Dataset): + """A Kafka Dataset that consumes the message.""" + + def __init__(self, + topics, + servers='localhost', + group='', + eof=False, + timeout=1000, + config_global=None, + config_topic=None, + message_key=False, + message_offset=False): + """Create a KafkaReader. + + Args: + topics: A `tf.string` tensor containing one or more subscriptions, + in the format of [topic:partition:offset:length], + by default length is -1 for unlimited. + servers: A list of bootstrap servers. + group: The consumer group id. + eof: If True, the kafka reader will stop on EOF. + timeout: The timeout value for the Kafka Consumer to wait + (in millisecond). + config_global: A `tf.string` tensor containing global configuration + properties in [Key=Value] format, + eg. ["enable.auto.commit=false", + "heartbeat.interval.ms=2000"], + please refer to 'Global configuration properties' + in librdkafka doc. + config_topic: A `tf.string` tensor containing topic configuration + properties in [Key=Value] format, + eg. ["auto.offset.reset=earliest"], + please refer to 'Topic configuration properties' + in librdkafka doc. + message_key: If True, the kafka will output both message value and key. + message_offset: If True, the kafka will output both message value and offset. 
+ """ + self._topics = ops.convert_to_tensor( + topics, dtype=dtypes.string, name='topics') + self._servers = ops.convert_to_tensor( + servers, dtype=dtypes.string, name='servers') + self._group = ops.convert_to_tensor( + group, dtype=dtypes.string, name='group') + self._eof = ops.convert_to_tensor(eof, dtype=dtypes.bool, name='eof') + self._timeout = ops.convert_to_tensor( + timeout, dtype=dtypes.int64, name='timeout') + config_global = config_global if config_global else [] + self._config_global = ops.convert_to_tensor( + config_global, dtype=dtypes.string, name='config_global') + config_topic = config_topic if config_topic else [] + self._config_topic = ops.convert_to_tensor( + config_topic, dtype=dtypes.string, name='config_topic') + self._message_key = message_key + self._message_offset = message_offset + super(KafkaDataset, self).__init__() + + def _inputs(self): + return [] + + def _as_variant_tensor(self): + return gen_kafka_ops.io_kafka_dataset_v2( + self._topics, + self._servers, + self._group, + self._eof, + self._timeout, + self._config_global, + self._config_topic, + self._message_key, + self._message_offset, + ) + + @property + def output_classes(self): + if self._message_key ^ self._message_offset: + return (ops.Tensor, ops.Tensor) + elif self._message_key and self._message_offset: + return (ops.Tensor, ops.Tensor, ops.Tensor) + return (ops.Tensor) + + @property + def output_shapes(self): + if self._message_key ^ self._message_offset: + return ((tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))) + elif self._message_key and self._message_offset: + return ((tensor_shape.TensorShape([]), tensor_shape.TensorShape([]), + tensor_shape.TensorShape([]))) + return ((tensor_shape.TensorShape([]))) + + @property + def output_types(self): + if self._message_key ^ self._message_offset: + return ((dtypes.string, dtypes.string)) + elif self._message_key and self._message_offset: + return ((dtypes.string, dtypes.string, dtypes.string)) + return ((dtypes.string)) + + +def write_kafka_v2(message, topic, servers='localhost', name=None): + """Write kafka. + + Args: + message: A `Tensor` of type `string`. 0-D. + topic: A `tf.string` tensor containing one subscription, + in the format of topic:partition. + servers: A list of bootstrap servers. + name: A name for the operation (optional). + + Returns: + A `Tensor` of type `string`. 0-D. + """ + return gen_kafka_ops.io_write_kafka_v2( + message=message, topic=topic, servers=servers, name=name) diff --git a/easy_rec/python/input/kafka_input.py b/easy_rec/python/input/kafka_input.py index 63bf5a4d2..40956c425 100644 --- a/easy_rec/python/input/kafka_input.py +++ b/easy_rec/python/input/kafka_input.py @@ -2,10 +2,19 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
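Before the kafka_input.py changes below, the new dataset can be exercised on its own. A minimal graph-mode sketch, assuming a broker at 127.0.0.1:9092 and an existing topic named `test_topic` (the same setup the kafka_test.py case further down builds); with `message_key` and `message_offset` enabled each element is a (value, key, offset) triple, and offsets are strings like `0:5` (partition:offset).

```python
import tensorflow as tf
from tensorflow.python.data.ops import iterator_ops

from easy_rec.python.input.kafka_dataset import KafkaDataset

# topic spec is topic:partition:offset (length defaults to -1 = unbounded)
dataset = KafkaDataset(
    topics=['test_topic:0:0'],               # assumed existing topic/partition
    servers='127.0.0.1:9092',                # assumed local broker
    group='kafka_dataset_demo',
    eof=True,                                # stop at end of partition
    config_global=['max.partition.fetch.bytes=1048576'],
    message_key=True,
    message_offset=True)
dataset = dataset.batch(4)

iterator = iterator_ops.Iterator.from_structure(dataset.output_types)
init_op = iterator.make_initializer(dataset)
values, keys, offsets = iterator.get_next()

with tf.Session() as sess:
  sess.run(init_op)
  try:
    while True:
      batch_values, batch_offsets = sess.run([values, offsets])
      print(batch_values, batch_offsets)     # offsets look like b'0:5'
  except tf.errors.OutOfRangeError:
    pass
```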
import logging import sys +import traceback +import json +import six import tensorflow as tf from easy_rec.python.input.input import Input +from easy_rec.python.input.kafka_dataset import KafkaDataset + +try: + from kafka import KafkaConsumer +except ImportError as ex: + logging.warning('kafka-python is not installed: %s' % traceback.format_exc(ex)) if tf.__version__ >= '2.0': tf = tf.compat.v1 @@ -13,6 +22,8 @@ class KafkaInput(Input): + DATA_OFFSET = 'DATA_OFFSET' + def __init__(self, data_config, feature_config, @@ -22,92 +33,113 @@ def __init__(self, super(KafkaInput, self).__init__(data_config, feature_config, '', task_index, task_num) self._kafka = kafka_config + self._offset_dict = {} + if self._kafka is not None: + # each topic in the format: topic:partition_id:offset + self._topics = [] + if self._kafka.offset_info: + offset_dict = json.loads(self._kafka.offset_info) + for part in offset_dict: + part_id = int(part) + if (part_id % self._task_num) == self._task_index: + self._offset_dict[part_id] = offset_dict[part] + consumer = KafkaConsumer(group_id='kafka_dataset_consumer', + bootstrap_servers=[self._kafka.server]) + partitions = consumer.partitions_for_topic(self._kafka.topic) + num_partition = len(partitions) + logging.info('all partitions[%d]: %s' % (num_partition, partitions)) + for part_id in range(num_partition): + if (part_id % self._task_num) == self._task_index: + offset = self._offset_dict.get(part_id, 0) + self._topics.append('%s:%d:%d' % (self._kafka.topic, part_id, offset)) + logging.info('assigned topic partitions: %s' % (','.join(self._topics))) + assert len(self._topics) > 0, 'no partitions are assigned for this task(%d/%d)' % ( + self._task_index, self._task_num) + else: + self._topics = None + + def _preprocess(self, field_dict): + output_dict = super(KafkaInput, self)._preprocess(field_dict) + output_dict[Input.DATA_OFFSET] = field_dict[Input.DATA_OFFSET] - def _parse_csv(self, line): + if Input.DATA_OFFSET not in self._appended_fields: + self._appended_fields.append(Input.DATA_OFFSET) + return output_dict + + def _parse_csv(self, line, message_key, message_offset): record_defaults = [ self.get_type_defaults(t, v) for t, v in zip(self._input_field_types, self._input_field_defaults) ] - def _check_data(line): - sep = self._data_config.separator - if type(sep) != type(str): - sep = sep.encode('utf-8') - field_num = len(line[0].split(sep)) - assert field_num == len(record_defaults),\ - 'sep[%s] maybe invalid: field_num=%d, required_num=%d' % (sep, field_num, len(record_defaults)) - return True - - check_op = tf.py_func(_check_data, [line], Tout=tf.bool) - with tf.control_dependencies([check_op]): - fields = tf.decode_csv( - line, - field_delim=self._data_config.separator, - record_defaults=record_defaults, - name='decode_csv') + fields = tf.decode_csv( + line, + use_quote_delim=False, + field_delim=self._data_config.separator, + record_defaults=record_defaults, + name='decode_csv') inputs = {self._input_fields[x]: fields[x] for x in self._effective_fids} for x in self._label_fids: inputs[self._input_fields[x]] = fields[x] + + # record current offset + def _parse_offset(message_offset): + for kv in message_offset: + if six.PY3: + kv = kv.decode('utf-8') + k,v = kv.split(':') + v = int(v) + if k not in self._offset_dict or v > self._offset_dict[k]: + self._offset_dict[k] = v + return json.dumps(self._offset_dict) + + inputs[Input.DATA_OFFSET] = tf.py_func(_parse_offset, [message_offset], tf.string) return inputs - def _build(self, mode, params): - try: - import 
tensorflow_io.kafka as kafka_io - except ImportError: - logging.error( - 'Please install tensorflow-io, ' - 'version compatibility can refer to https://github.com/tensorflow/io#tensorflow-version-compatibility' - ) + def _preprocess(self, field_dict): + output_dict = super(KafkaInput, self)._preprocess(field_dict) + # append offset fields + if Input.DATA_OFFSET in field_dict: + output_dict[Input.DATA_OFFSET] = field_dict[Input.DATA_OFFSET] + + # for _get_features to include DATA_OFFSET + if Input.DATA_OFFSET not in self._appended_fields: + self._appended_fields.append(Input.DATA_OFFSET) + + return output_dict + + def _build(self, mode, params): num_parallel_calls = self._data_config.num_parallel_calls if mode == tf.estimator.ModeKeys.TRAIN: - train = self._kafka - topics = [] - i = self._task_index - assert len(train.offset) == 1 or len(train.offset) == train.partitions, \ - 'number of train.offset must be 1 or train.partitions' - while i < train.partitions: - offset_i = train.offset[i] if i < len( - train.offset) else train.offset[-1] - topics.append(train.topic + ':' + str(i) + ':' + str(offset_i) + ':-1') - i = i + self._task_num - + train_kafka = self._kafka logging.info( 'train kafka server: %s topic: %s task_num: %d task_index: %d topics: %s' % - (train.server, train.topic, self._task_num, self._task_index, topics)) - if len(topics) == 0: - logging.info('train kafka topic is empty') - sys.exit(1) - - dataset = kafka_io.KafkaDataset( - topics, servers=train.server, group=train.group, eof=False) - dataset = dataset.repeat(1) + (train_kafka.server, train_kafka.topic, self._task_num, self._task_index, self._topics)) + + dataset = KafkaDataset( + self._topics, + servers=train_kafka.server, + group=train_kafka.group, + eof=False, + config_global = list(self._kafka.config_global), + config_topic = list(self._kafka.config_topic), + message_key=True, + message_offset=True) else: - eval = self._kafka - topics = [] - i = 0 - assert len(eval.offset) == 1 or len(eval.offset) == eval.partitions, \ - 'number of eval.offset must be 1 or eval.partitions' - while i < eval.partitions: - offset_i = eval.offset[i] if i < len(eval.offset) else eval.offset[-1] - topics.append(eval.topic + ':' + str(i) + ':' + str(eval.offset) + - ':-1') - i = i + 1 - + eval_kafka = self._kafka logging.info( 'eval kafka server: %s topic: %s task_num: %d task_index: %d topics: %s' - % (eval.server, eval.topic, self._task_num, self._task_index, topics)) - - if len(topics) == 0: - logging.info('eval kafka topic is empty') - sys.exit(1) + % (eval_kafka.server, eval_kafka.topic, self._task_num, self._task_index, self._topics)) - dataset = kafka_io.KafkaDataset( - topics, servers=eval.server, group=eval.group, eof=False) - dataset = dataset.repeat(1) + dataset = KafkaDataset(self._topics, servers=self._kafka.server, + group=eval_kafka.group, eof=True, + config_global = list(self._kafka.config_global), + config_topic = list(self._kafka.config_topic), + message_key=True, message_offset=True) dataset = dataset.batch(self._data_config.batch_size) dataset = dataset.map( diff --git a/easy_rec/python/main.py b/easy_rec/python/main.py index cbaaf5ed2..aee434fe1 100644 --- a/easy_rec/python/main.py +++ b/easy_rec/python/main.py @@ -288,8 +288,12 @@ def _train_and_evaluate_impl(pipeline_config, continue_train=False): eval_data = _get_input_object_by_name(pipeline_config, 'eval') distribution = strategy_builder.build(train_config) + params = {} + if train_config.is_profiling: + params['log_device_placement'] = True estimator, run_config = 
_create_estimator( - pipeline_config, distribution=distribution) + pipeline_config, distribution=distribution, + params=params) master_stat_file = os.path.join(pipeline_config.model_dir, 'master.stat') version_file = os.path.join(pipeline_config.model_dir, 'version') @@ -312,6 +316,20 @@ def _train_and_evaluate_impl(pipeline_config, continue_train=False): if data_config.input_type == data_config.InputType.OdpsRTPInputV2: input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path + # support for datahub/kafka offset restore + final_ckpt = estimator_utils.latest_checkpoint(pipeline_config.model_dir) + if final_ckpt is not None: + final_offset_path = final_ckpt + '.offset' + logging.info('restore offset_info from %s' % final_offset_path) + if gfile.Exists(final_offset_path): + with gfile.GFile(final_offset_path) as fin: + offset_info = json.load(fin) + if train_data: + train_data.offset_info = json.dumps(offset_info) + if eval_data is not None: + eval_data.offset_info = json.dumps(offset_info) + + # create train input train_input_fn = _get_input_fn(data_config, feature_configs, train_data, **input_fn_kwargs) @@ -362,10 +380,7 @@ def evaluate(pipeline_config, pipeline_config.eval_input_path = eval_data_path train_config = pipeline_config.train_config - if pipeline_config.WhichOneof('eval_path') == 'kafka_eval_input': - eval_data = pipeline_config.kafka_eval_input - else: - eval_data = pipeline_config.eval_input_path + eval_data = _get_input_object_by_name(pipeline_config, 'eval') server_target = None if 'TF_CONFIG' in os.environ: @@ -726,6 +741,13 @@ def export(export_dir, serving_input_fn = _get_input_fn(data_config, feature_configs, None, export_config, **input_fn_kwargs) if 'oss_path' in extra_params: + if pipeline_config.train_config.HasField('incr_save_config'): + incr_save_config = pipeline_config.train_config.incr_save_config + extra_params['incr_save'] = {} + if incr_save_config.HasField('kafka'): + extra_params['incr_save']['kafka'] = incr_save_config.kafka + if incr_save_config.HasField('datahub'): + extra_params['incr_save']['datahub'] = incr_save_config.datahub + return export_big_model_to_oss(export_dir, pipeline_config, extra_params, serving_input_fn, estimator, checkpoint_path, verbose) diff --git a/easy_rec/python/model/easy_rec_estimator.py b/easy_rec/python/model/easy_rec_estimator.py index 7cb7d56bf..69b6a13ac 100644 --- a/easy_rec/python/model/easy_rec_estimator.py +++ b/easy_rec/python/model/easy_rec_estimator.py @@ -29,7 +29,10 @@ from easy_rec.python.protos.train_pb2 import DistributionStrategy from easy_rec.python.utils import estimator_utils from easy_rec.python.utils import pai_util +from easy_rec.python.utils import constant from easy_rec.python.utils.multi_optimizer import MultiOptimizer +from easy_rec.python.input.input import Input +from tensorflow.python.platform import gfile if tf.__version__ >= '2.0': tf = tf.compat.v1 @@ -70,6 +73,11 @@ def eval_config(self): def train_config(self): return self._pipeline_config.train_config + @property + def incr_save_config(self): + return self.train_config.incr_save_config \ + if self.train_config.HasField('incr_save_config') else None + @property def export_config(self): return self._pipeline_config.export_config @@ -106,9 +114,25 @@ def _train_model_fn(self, features, labels, run_config): for key in loss_dict: tf.summary.scalar(key, loss_dict[key], family='loss') + if Input.DATA_OFFSET in features: + task_index, task_num = estimator_utils.get_task_index_and_num() + data_offset_var = tf.get_variable(name=Input.DATA_OFFSET,
dtype=tf.string, + shape=[task_num], + collections=[tf.GraphKeys.GLOBAL_VARIABLES, Input.DATA_OFFSET], + trainable=False) + update_offset = tf.assign(data_offset_var[task_index], features[Input.DATA_OFFSET]) + tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_offset) + else: + data_offset_var = None + # update op, usually used for batch-norm update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) if update_ops: + # register for increment update, such as batchnorm moving_mean and moving_variance + global_vars = { x.name:x for x in tf.global_variables() } + for x in update_ops: + if x.inputs[0].name in global_vars: + ops.add_to_collection(constant.DENSE_UPDATE_VARIABLES, global_vars[x.inputs[0].name]) update_op = tf.group(*update_ops, name='update_barrier') with tf.control_dependencies([update_op]): loss = tf.identity(loss, name='total_loss') @@ -231,7 +255,9 @@ def _train_model_fn(self, features, labels, run_config): colocate_gradients_with_ops=True, not_apply_grad_after_first_step=run_config.is_chief and self._pipeline_config.data_config.chief_redundant, - name='') # Preventing scope prefix on all variables. + name='', # Preventing scope prefix on all variables. + incr_save=(self.incr_save_config is not None)) + # online evaluation metric_update_op_dict = None @@ -284,6 +310,7 @@ def format_fn(tensor_dict): if self.train_config.train_distribute in [ DistributionStrategy.CollectiveAllReduceStrategy, + DistributionStrategy.MirroredStrategy, DistributionStrategy.MultiWorkerMirroredStrategy ]: # for multi worker strategy, we could not replace the @@ -294,35 +321,36 @@ def format_fn(tensor_dict): var_list = ( tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)) - initialize_var_list = [ - x for x in var_list if 'WorkQueue' not in str(type(x)) - ] + + # exclude data_offset_var + var_list = [ x for x in var_list if x != data_offset_var ] # early_stop flag will not be saved in checkpoint # and could not be restored from checkpoint early_stop_var = find_early_stop_var(var_list) + var_list = [x for x in var_list if x != early_stop_var] + + initialize_var_list = [ + x for x in var_list if 'WorkQueue' not in str(type(x)) + ] + # incompatiable shape restore will not be saved in checkpoint # but must be able to restore from checkpoint incompatiable_shape_restore = tf.get_collection('T_E_M_P_RESTROE') - if early_stop_var is not None: - var_list = [x for x in var_list if x != early_stop_var] - local_init_op = tf.group([ - tf.initializers.local_variables(), - tf.initializers.variables([early_stop_var] + - incompatiable_shape_restore) - ]) - elif len(incompatiable_shape_restore) > 0: - local_init_op = tf.group([ - tf.initializers.local_variables(), - tf.initializers.variables(incompatiable_shape_restore) - ]) - else: - local_init_op = None + + local_init_ops = [tf.train.Scaffold.default_local_init_op()] + if data_offset_var is not None and estimator_utils.is_chief(): + local_init_ops.append(tf.initializers.variables([data_offset_var])) + if early_stop_var is not None and estimator_utils.is_chief(): + local_init_ops.append(tf.initializers.variables([early_stop_var])) + if len(incompatiable_shape_restore) > 0: + local_init_ops.append(tf.initializers.variables(incompatiable_shape_restore)) + scaffold = tf.train.Scaffold( saver=tf.train.Saver( var_list=var_list, sharded=True, max_to_keep=self.train_config.keep_checkpoint_max), - local_init_op=local_init_op, + local_init_op=tf.group(local_init_ops), ready_for_local_init_op=tf.report_uninitialized_variables( 
var_list=initialize_var_list)) # saver hook @@ -331,11 +359,19 @@ def format_fn(tensor_dict): save_secs=self._config.save_checkpoints_secs, save_steps=self._config.save_checkpoints_steps, scaffold=scaffold, - write_graph=self.train_config.write_graph) + write_graph=self.train_config.write_graph, + data_offset_var=data_offset_var, + increment_save_config=self.incr_save_config) chief_hooks = [] if estimator_utils.is_chief(): hooks.append(saver_hook) + # oss stop signal hook + if self.train_config.enable_oss_stop_signal: + oss_stop_signal = estimator_utils.OssStopSignalHook( + model_dir=self.model_dir) + hooks.append(oss_stop_signal) + # profiling hook if self.train_config.is_profiling and estimator_utils.is_chief(): profile_hook = tf.train.ProfilerHook( @@ -461,13 +497,25 @@ def _export_model_fn(self, features, labels, run_config, params): # save train pipeline.config for debug purpose pipeline_path = os.path.join(self._model_dir, 'pipeline.config') - if tf.gfile.Exists(pipeline_path): + if gfile.Exists(pipeline_path): tf.add_to_collection( tf.GraphKeys.ASSET_FILEPATHS, tf.constant(pipeline_path, dtype=tf.string, name='pipeline.config')) else: print('train pipeline_path(%s) does not exist' % pipeline_path) + # restore DENSE_UPDATE_VARIABLES collection + dense_train_var_path = os.path.join(self.model_dir, constant.DENSE_UPDATE_VARIABLES) + if gfile.Exists(dense_train_var_path): + with gfile.GFile(dense_train_var_path, 'r') as fin: + var_name_to_id_map = json.load(fin) + var_name_id_lst = [ (x, var_name_to_id_map[x]) for x in var_name_to_id_map ] + var_name_id_lst.sort(key=lambda x : x[1]) + all_vars = { x.op.name:x for x in tf.global_variables() } + for var_name, var_id in var_name_id_lst: + assert var_name in all_vars, 'dense_train_var[%s] is not found' % var_name + tf.add_to_collection(constant.DENSE_UPDATE_VARIABLES, all_vars[var_name]) + # add more asset files if 'asset_files' in params: for asset_name in params['asset_files']: @@ -505,7 +553,7 @@ def _write_rtp_fg_config_to_col(fg_config=None, fg_config_path=None): fg_config_path: path to the RTP config file. 
""" if fg_config is None: - with tf.gfile.GFile(fg_config_path, 'r') as f: + with gfile.GFile(fg_config_path, 'r') as f: fg_config = json.load(f) col = ops.get_collection_ref(GraphKeys.RANK_SERVICE_FG_CONF) if len(col) == 0: diff --git a/easy_rec/python/ops/1.12/incr_record.so b/easy_rec/python/ops/1.12/incr_record.so new file mode 100755 index 000000000..3f258e06e Binary files /dev/null and b/easy_rec/python/ops/1.12/incr_record.so differ diff --git a/easy_rec/python/ops/1.12/kafka.so b/easy_rec/python/ops/1.12/kafka.so index d5b33cc46..42164529a 100755 Binary files a/easy_rec/python/ops/1.12/kafka.so and b/easy_rec/python/ops/1.12/kafka.so differ diff --git a/easy_rec/python/ops/1.12/libembed_op.so b/easy_rec/python/ops/1.12/libembed_op.so index 5f46ee7f8..8a41da0b2 100644 Binary files a/easy_rec/python/ops/1.12/libembed_op.so and b/easy_rec/python/ops/1.12/libembed_op.so differ diff --git a/easy_rec/python/ops/1.12/librdkafka++.so.1 b/easy_rec/python/ops/1.12/librdkafka++.so.1 new file mode 100755 index 000000000..8a448378c Binary files /dev/null and b/easy_rec/python/ops/1.12/librdkafka++.so.1 differ diff --git a/easy_rec/python/ops/1.12/librdkafka.so.1 b/easy_rec/python/ops/1.12/librdkafka.so.1 new file mode 100755 index 000000000..c7ab65e96 Binary files /dev/null and b/easy_rec/python/ops/1.12/librdkafka.so.1 differ diff --git a/easy_rec/python/ops/1.12_pai/__init__.py b/easy_rec/python/ops/1.12_pai/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/easy_rec/python/ops/1.12_pai/incr_record.so b/easy_rec/python/ops/1.12_pai/incr_record.so new file mode 100755 index 000000000..e6c0d42b0 Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/incr_record.so differ diff --git a/easy_rec/python/ops/1.12_pai/kafka.so b/easy_rec/python/ops/1.12_pai/kafka.so new file mode 100755 index 000000000..2df02c3a5 Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/kafka.so differ diff --git a/easy_rec/python/ops/1.12_pai/kafka.so.bak b/easy_rec/python/ops/1.12_pai/kafka.so.bak new file mode 100755 index 000000000..e232014c0 Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/kafka.so.bak differ diff --git a/easy_rec/python/ops/1.12_pai/libembed_op.so b/easy_rec/python/ops/1.12_pai/libembed_op.so new file mode 100644 index 000000000..5f46ee7f8 Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/libembed_op.so differ diff --git a/easy_rec/python/ops/1.12_pai/libhiredis.so.1.0.0 b/easy_rec/python/ops/1.12_pai/libhiredis.so.1.0.0 new file mode 100644 index 000000000..63ae04d40 Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/libhiredis.so.1.0.0 differ diff --git a/easy_rec/python/ops/1.12_pai/libkafka.so b/easy_rec/python/ops/1.12_pai/libkafka.so new file mode 100755 index 000000000..566ce198b Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/libkafka.so differ diff --git a/easy_rec/python/ops/1.12_pai/librdkafka++.so.1 b/easy_rec/python/ops/1.12_pai/librdkafka++.so.1 new file mode 100755 index 000000000..8a448378c Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/librdkafka++.so.1 differ diff --git a/easy_rec/python/ops/1.12_pai/librdkafka.so.1 b/easy_rec/python/ops/1.12_pai/librdkafka.so.1 new file mode 100755 index 000000000..c7ab65e96 Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/librdkafka.so.1 differ diff --git a/easy_rec/python/ops/1.12_pai/libredis++.so b/easy_rec/python/ops/1.12_pai/libredis++.so new file mode 100644 index 000000000..cadfccc27 Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/libredis++.so 
differ diff --git a/easy_rec/python/ops/1.12_pai/libredis++.so.1 b/easy_rec/python/ops/1.12_pai/libredis++.so.1 new file mode 100644 index 000000000..cadfccc27 Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/libredis++.so.1 differ diff --git a/easy_rec/python/ops/1.12_pai/libredis++.so.1.2.3 b/easy_rec/python/ops/1.12_pai/libredis++.so.1.2.3 new file mode 100644 index 000000000..cadfccc27 Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/libredis++.so.1.2.3 differ diff --git a/easy_rec/python/ops/1.12_pai/libwrite_sparse_kv.so b/easy_rec/python/ops/1.12_pai/libwrite_sparse_kv.so new file mode 100755 index 000000000..d50ee8edc Binary files /dev/null and b/easy_rec/python/ops/1.12_pai/libwrite_sparse_kv.so differ diff --git a/easy_rec/python/ops/1.15/incr_record.so b/easy_rec/python/ops/1.15/incr_record.so new file mode 100755 index 000000000..e92ea0f36 Binary files /dev/null and b/easy_rec/python/ops/1.15/incr_record.so differ diff --git a/easy_rec/python/ops/1.15/kafka.so b/easy_rec/python/ops/1.15/kafka.so index 3ba64834a..6886446d8 100755 Binary files a/easy_rec/python/ops/1.15/kafka.so and b/easy_rec/python/ops/1.15/kafka.so differ diff --git a/easy_rec/python/ops/1.15/libembed_op.so b/easy_rec/python/ops/1.15/libembed_op.so index 69d396100..4d8f6275c 100755 Binary files a/easy_rec/python/ops/1.15/libembed_op.so and b/easy_rec/python/ops/1.15/libembed_op.so differ diff --git a/easy_rec/python/ops/1.15/librdkafka++.so b/easy_rec/python/ops/1.15/librdkafka++.so new file mode 100755 index 000000000..969f8ab1d Binary files /dev/null and b/easy_rec/python/ops/1.15/librdkafka++.so differ diff --git a/easy_rec/python/ops/1.15/librdkafka++.so.1 b/easy_rec/python/ops/1.15/librdkafka++.so.1 new file mode 100755 index 000000000..969f8ab1d Binary files /dev/null and b/easy_rec/python/ops/1.15/librdkafka++.so.1 differ diff --git a/easy_rec/python/ops/1.15/librdkafka.so b/easy_rec/python/ops/1.15/librdkafka.so new file mode 100755 index 000000000..c83248971 Binary files /dev/null and b/easy_rec/python/ops/1.15/librdkafka.so differ diff --git a/easy_rec/python/ops/1.15/librdkafka.so.1 b/easy_rec/python/ops/1.15/librdkafka.so.1 new file mode 100755 index 000000000..c83248971 Binary files /dev/null and b/easy_rec/python/ops/1.15/librdkafka.so.1 differ diff --git a/easy_rec/python/ops/gen_kafka_ops.py b/easy_rec/python/ops/gen_kafka_ops.py new file mode 100644 index 000000000..d971f4563 --- /dev/null +++ b/easy_rec/python/ops/gen_kafka_ops.py @@ -0,0 +1,189 @@ +"""Python wrappers around TensorFlow ops. + +This file is MACHINE GENERATED! Do not edit. +Original C++ source file: kafka_ops_deprecated.cc +""" + +import os +import logging + +import six as _six +import tensorflow as tf +from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow +from tensorflow.python.eager import context as _context +from tensorflow.python.eager import core as _core +from tensorflow.python.eager import execute as _execute + +# Needed to trigger the call to _set_call_cpp_shape_fn. 
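The estimator changes above only add a `[task_num]`-shaped string variable (`DATA_OFFSET`) that each worker updates with its own offset JSON; the extended saver hook (which now receives `data_offset_var` and `increment_save_config`) is expected to dump it next to each checkpoint as `<ckpt>.offset`, which is exactly the file `_train_and_evaluate_impl` reads back into `train_data.offset_info`. The hook body is not part of this diff, so the round trip below is only a sketch under that assumption; `merge_offsets` and the helper names are illustrative.

```python
import json

from tensorflow.python.platform import gfile


def merge_offsets(offset_strs):
  """Merge the per-worker JSON strings stored in the DATA_OFFSET variable.

  Illustrative only: each worker writes offsets for its own shards or
  partitions, so a plain dict update is enough to combine them.
  """
  merged = {}
  for s in offset_strs:
    if s:
      merged.update(json.loads(s))
  return merged


def save_offset_info(ckpt_path, offset_strs):
  # assumed layout: offsets live next to the checkpoint as <ckpt>.offset
  with gfile.GFile(ckpt_path + '.offset', 'w') as fout:
    json.dump(merge_offsets(offset_strs), fout)


def restore_offset_info(ckpt_path):
  offset_path = ckpt_path + '.offset'
  if not gfile.Exists(offset_path):
    return None
  with gfile.GFile(offset_path) as fin:
    # same json.dumps(...) string that DataHubInput/KafkaInput expect
    return json.dumps(json.load(fin))
```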
+from tensorflow.python.framework import dtypes as _dtypes +from tensorflow.python.framework import ops as _ops +from tensorflow.python.util.tf_export import tf_export +import easy_rec + + +try: + kafka_module = tf.load_op_library(os.path.join(easy_rec.ops_dir, 'kafka.so')) +except Exception as ex: + logging.error("failed to load kafka.so: %s" % str(ex)) + kafka_module = None + + +@tf_export('io_kafka_dataset_v2') +def io_kafka_dataset_v2(topics, + servers, + group, + eof, + timeout, + config_global, + config_topic, + message_key, + message_offset, + name=None): + """Creates a dataset that emits the messages of one or more Kafka topics. + + Args: + topics: A `Tensor` of type `string`. + A `tf.string` tensor containing one or more subscriptions, + in the format of [topic:partition:offset]. + servers: A `Tensor` of type `string`. A list of bootstrap servers. + group: A `Tensor` of type `string`. The consumer group id. + eof: A `Tensor` of type `bool`. + If True, the kafka reader will stop on EOF. + timeout: A `Tensor` of type `int64`. + The timeout value for the Kafka Consumer to wait + (in millisecond). + config_global: A `Tensor` of type `string`. + A `tf.string` tensor containing global configuration + properties in [Key=Value] format, + eg. ["enable.auto.commit=false", "heartbeat.interval.ms=2000"], + please refer to 'Global configuration properties' in librdkafka doc. + config_topic: A `Tensor` of type `string`. + A `tf.string` tensor containing topic configuration + properties in [Key=Value] format, eg. ["auto.offset.reset=earliest"], + please refer to 'Topic configuration properties' in librdkafka doc. + message_key: A `Tensor` of type `bool`. + message_offset: A `Tensor` of type `bool`. + name: A name for the operation (optional). + + Returns: + A `Tensor` of type `variant`. + """ + return kafka_module.io_kafka_dataset_v2( + topics=topics, + servers=servers, + group=group, + eof=eof, + timeout=timeout, + config_global=config_global, + config_topic=config_topic, + message_key=message_key, + message_offset=message_offset, + name=name) + + +def io_kafka_dataset_eager_fallback(topics, + servers, + group, + eof, + timeout, + config_global, + config_topic, + message_key, + message_offset, + name=None, + ctx=None): + """This is the slowpath function for Eager mode. + + This is for function io_kafka_dataset + """ + _ctx = ctx if ctx else _context.context() + topics = _ops.convert_to_tensor(topics, _dtypes.string) + servers = _ops.convert_to_tensor(servers, _dtypes.string) + group = _ops.convert_to_tensor(group, _dtypes.string) + eof = _ops.convert_to_tensor(eof, _dtypes.bool) + timeout = _ops.convert_to_tensor(timeout, _dtypes.int64) + config_global = _ops.convert_to_tensor(config_global, _dtypes.string) + config_topic = _ops.convert_to_tensor(config_topic, _dtypes.string) + message_key = _ops.convert_to_tensor(message_key, _dtypes.bool) + message_offset = _ops.convert_to_tensor(message_offset, _dtypes.bool) + _inputs_flat = [ + topics, servers, group, eof, timeout, config_global, config_topic, + message_key, message_offset + ] + _attrs = None + _result = _execute.execute( + b'IOKafkaDataset', + 1, + inputs=_inputs_flat, + attrs=_attrs, + ctx=_ctx, + name=name) + _execute.record_gradient('IOKafkaDataset', _inputs_flat, _attrs, _result, + name) + _result, = _result + return _result + + +@tf_export('io_write_kafka_v2') +def io_write_kafka_v2(message, topic, servers, name=None): + r"""TODO: add doc. + + Args: + message: A `Tensor` of type `string`. + topic: A `Tensor` of type `string`. 
+ servers: A `Tensor` of type `string`. + name: A name for the operation (optional). + + Returns: + A `Tensor` of type `string`. + """ + _ctx = _context._context + if _ctx is None or not _ctx._eager_context.is_eager: + _op = kafka_module.io_write_kafka_v2( + message=message, topic=topic, servers=servers, name=name) + _result = _op.outputs[:] + _inputs_flat = _op.inputs + _attrs = None + _execute.record_gradient('IOWriteKafka', _inputs_flat, _attrs, _result, + name) + _result, = _result + return _result + + else: + try: + _result = _pywrap_tensorflow.TFE_Py_FastPathExecute( + _ctx._context_handle, _ctx._eager_context.device_name, 'IOWriteKafka', + name, _ctx._post_execution_callbacks, message, topic, servers) + return _result + except _core._FallbackException: + return io_write_kafka_eager_fallback( + message, topic, servers, name=name, ctx=_ctx) + except _core._NotOkStatusException as e: + if name is not None: + message = e.message + ' name: ' + name + else: + message = e.message + _six.raise_from(_core._status_to_exception(e.code, message), None) + + +def io_write_kafka_eager_fallback(message, topic, servers, name=None, ctx=None): + """This is the slowpath function for Eager mode. + + This is for function io_write_kafka + """ + _ctx = ctx if ctx else _context.context() + message = _ops.convert_to_tensor(message, _dtypes.string) + topic = _ops.convert_to_tensor(topic, _dtypes.string) + servers = _ops.convert_to_tensor(servers, _dtypes.string) + _inputs_flat = [message, topic, servers] + _attrs = None + _result = _execute.execute( + b'IOWriteKafka', + 1, + inputs=_inputs_flat, + attrs=_attrs, + ctx=_ctx, + name=name) + _execute.record_gradient('IOWriteKafka', _inputs_flat, _attrs, _result, name) + _result, = _result + return _result diff --git a/easy_rec/python/ops/incr_record.py b/easy_rec/python/ops/incr_record.py new file mode 100644 index 000000000..eee1f9e17 --- /dev/null +++ b/easy_rec/python/ops/incr_record.py @@ -0,0 +1,20 @@ +# -*- encoding:utf-8 -*- +# Copyright (c) Alibaba, Inc. and its affiliates. 
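The `set_sparse_indices` op exported by this new incr_record.py module (loaded from `incr_record.so` just below) is what `optimize_loss(incr_save=True)` calls for every `IndexedSlices` gradient, while filing variables into the `SPARSE_UPDATE_VARIABLES` / `DENSE_UPDATE_VARIABLES` collections. A self-contained sketch of that split, with `fake_set_sparse_indices` standing in for the real op so the snippet runs without the custom library.

```python
import tensorflow as tf
from tensorflow.python.framework import ops


def fake_set_sparse_indices(indices, var_name):
  """Stand-in for incr_record.set_sparse_indices.

  The real op records which rows of `var_name` changed so they can be
  exported incrementally; this stand-in just passes the ids through so the
  graph is wired the same way.
  """
  return tf.identity(indices)


def collect_incr_update_vars(grads_and_vars):
  """Split variables the way optimize_loss(incr_save=True) does."""
  incr_save_ops = []
  for grad, var in grads_and_vars:
    if isinstance(grad, ops.IndexedSlices):
      # sparse update: only a few embedding rows changed this step
      incr_save_ops.append(fake_set_sparse_indices(grad.indices, var.op.name))
      ops.add_to_collection('SPARSE_UPDATE_VARIABLES', (var, grad.indices.dtype))
    else:
      # dense update: the whole tensor has to be shipped on each dump
      ops.add_to_collection('DENSE_UPDATE_VARIABLES', var)
  return incr_save_ops


emb = tf.get_variable('emb', shape=[100, 8])
dense_w = tf.get_variable('dense_w', shape=[8, 1])
loss = tf.reduce_sum(tf.nn.embedding_lookup(emb, [1, 5])) + tf.reduce_sum(dense_w)
grads_and_vars = tf.train.GradientDescentOptimizer(0.1).compute_gradients(
    loss, var_list=[emb, dense_w])
incr_ops = collect_incr_update_vars(grads_and_vars)
```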
+import os +import logging +import easy_rec +import tensorflow as tf + +try: + op_path = os.path.join(easy_rec.ops_dir, "incr_record.so") + op = tf.load_op_library(op_path) + get_sparse_indices = op.get_sparse_indices + set_sparse_indices = op.set_sparse_indices +except ImportError as ex: + get_sparse_indices = None + set_sparse_indices = None + logging.warning('failed to import gen_io_ops.collect_sparse_indices: %s' % str(ex)) +except Exception as ex: + get_sparse_indices = None + set_sparse_indices = None + logging.warning('failed to import gen_io_ops.collect_sparse_indices: %s' % str(ex)) diff --git a/easy_rec/python/protos/data_source.proto b/easy_rec/python/protos/data_source.proto index a05134d12..d30e04a96 100644 --- a/easy_rec/python/protos/data_source.proto +++ b/easy_rec/python/protos/data_source.proto @@ -5,8 +5,12 @@ message KafkaServer { required string server = 1; required string topic = 2; required string group = 3; - required uint32 partitions = 4; - repeated uint32 offset = 5; + // in json format: {'0':10, '1':20} + optional string offset_info = 4; + // kafka global config, such as: fetch.max.bytes=1024 + repeated string config_global = 5; + // kafka topic config, such as: max.partition.fetch.bytes=1024 + repeated string config_topic = 6; } message DatahubServer{ @@ -15,6 +19,5 @@ message DatahubServer{ required string region = 3; required string project = 4; required string topic = 5; - required uint32 shard_num = 6; - required uint32 life_cycle = 7; + optional string offset_info = 6; } diff --git a/easy_rec/python/protos/train.proto b/easy_rec/python/protos/train.proto index b86e37a72..536fc614d 100644 --- a/easy_rec/python/protos/train.proto +++ b/easy_rec/python/protos/train.proto @@ -21,6 +21,44 @@ enum DistributionStrategy { MultiWorkerMirroredStrategy = 5; } +message IncrementSaveConfig { + message Kafka { + message Consumer { + optional string config_topic = 1; + optional string config_global = 2; + optional int64 offset = 3 [default=0]; + optional int32 timeout = 4 [default=600]; + } + required string server = 1; + required string topic = 2; + required Consumer consumer = 3; + } + + message Datahub { + message Consumer { + optional int64 offset = 1 [default=0]; + optional int32 timeout = 2 [default=600]; + } + required string akId = 1; + required string akSecret = 2; + required string region = 3; + required string project = 4; + required string topic = 5; + required Consumer consumer = 6; + } + + + optional int32 sparse_save_secs = 1 [default=0]; + optional int32 dense_save_secs = 2 [default=0]; + optional int32 sparse_save_steps = 3 [default=0]; + optional int32 dense_save_steps = 4 [default=0]; + + oneof incr_update_hub { + Kafka kafka = 501; + Datahub datahub = 502; + } +} + // Message for configuring EasyRecModel training jobs (train.py). 
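The new `offset_info` field replaces the old `partitions`/`offset` pair on `KafkaServer`: a JSON object mapping partition id to the offset to resume from, e.g. `{"0": 10, "1": 20}`. The sketch below mirrors how `KafkaInput.__init__` (shown earlier) turns it into per-task `topic:partition:offset` specs with round-robin partition assignment; the partition count is passed in directly here instead of being queried from a `KafkaConsumer`.

```python
import json


def build_topic_specs(topic, offset_info, num_partitions, task_index, task_num):
  """Assign partitions round-robin to workers and format topic:partition:offset."""
  offset_dict = {}
  if offset_info:
    for part, offset in json.loads(offset_info).items():
      part_id = int(part)
      if part_id % task_num == task_index:
        offset_dict[part_id] = offset
  topics = []
  for part_id in range(num_partitions):
    if part_id % task_num == task_index:
      offset = offset_dict.get(part_id, 0)
      topics.append('%s:%d:%d' % (topic, part_id, offset))
  return topics


# worker 1 of 2 over 4 partitions, resuming partition 1 at offset 20
print(build_topic_specs('easy_rec_test', '{"0": 10, "1": 20}', 4, 1, 2))
# ['easy_rec_test:1:20', 'easy_rec_test:3:0']
```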
// Next id: 25 message TrainConfig { @@ -107,4 +145,11 @@ message TrainConfig { // match variable patterns to freeze repeated string freeze_gradient = 30; + + // increment save config + optional IncrementSaveConfig incr_save_config = 31; + + // enable oss stop signal + // stop by create OSS_STOP_SIGNAL under model_dir + optional bool enable_oss_stop_signal = 32 [default = false]; } diff --git a/easy_rec/python/test/dh_local_run.py b/easy_rec/python/test/dh_local_run.py index a4282f891..c22fbaddb 100644 --- a/easy_rec/python/test/dh_local_run.py +++ b/easy_rec/python/test/dh_local_run.py @@ -37,7 +37,8 @@ def test_datahub_train_eval(self): odps_cmd = OdpsCommand(odps_oss_config) self._success = test_utils.test_datahub_train_eval( - '%s/configs/deepfm.config' % odps_oss_config.temp_dir, self._test_dir) + '%s/configs/deepfm.config' % odps_oss_config.temp_dir, + odps_oss_config, self._test_dir) odps_cmd.run_list(end) self.assertTrue(self._success) @@ -48,8 +49,6 @@ def test_datahub_train_eval(self): '--odps_config', type=str, default=None, help='odps config path') parser.add_argument( '--oss_config', type=str, default=None, help='ossutilconfig path') - parser.add_argument( - '--datahub_config', type=str, default=None, help='datahub_config') parser.add_argument( '--bucket_name', type=str, default=None, help='test oss bucket name') parser.add_argument('--arn', type=str, default=None, help='oss rolearn') @@ -73,8 +72,6 @@ def test_datahub_train_eval(self): if args.odps_config: odps_oss_config.load_odps_config(args.odps_config) os.environ['ODPS_CONFIG_FILE_PATH'] = args.odps_config - if args.datahub_config: - odps_oss_config.load_dh_config(args.datahub_config) if args.oss_config: odps_oss_config.load_oss_config(args.oss_config) if args.odpscmd: @@ -89,7 +86,6 @@ def test_datahub_train_eval(self): odps_oss_config.arn = args.arn if args.bucket_name: odps_oss_config.bucket_name = args.bucket_name - print(args) prepare(odps_oss_config) start = [ 'deep_fm/create_external_deepfm_table.sql', diff --git a/easy_rec/python/test/export_test.py b/easy_rec/python/test/export_test.py index 05b0c4aa9..99f5e2f0c 100644 --- a/easy_rec/python/test/export_test.py +++ b/easy_rec/python/test/export_test.py @@ -440,8 +440,7 @@ def _test_big_model_export_to_oss(self, --input_path %s --output_path %s """ % (config_path, test_data_path, result_path) - proc = test_utils.run_cmd(predict_cmd % (), - '%s/log_%s.txt' % (test_dir, 'predict')) + proc = test_utils.run_cmd(predict_cmd, '%s/log_%s.txt' % (test_dir, 'predict')) proc.wait() self.assertTrue(proc.returncode == 0) with open(result_path, 'r') as fin: diff --git a/easy_rec/python/test/kafka_test.py b/easy_rec/python/test/kafka_test.py new file mode 100644 index 000000000..8dfd8a23e --- /dev/null +++ b/easy_rec/python/test/kafka_test.py @@ -0,0 +1,317 @@ +# -*- encoding:utf-8 -*- +# Copyright (c) Alibaba, Inc. and its affiliates. 
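`enable_oss_stop_signal` lets an external process stop training by creating a file named `OSS_STOP_SIGNAL` under `model_dir` (per the proto comment above). The actual `estimator_utils.OssStopSignalHook` is not part of this diff, so the hook below is only a rough sketch of that assumed behavior built on a standard `SessionRunHook`; the 30-second polling interval is illustrative.

```python
import os
import time

import tensorflow as tf
from tensorflow.python.platform import gfile


class StopSignalHook(tf.train.SessionRunHook):
  """Rough sketch: request a stop once OSS_STOP_SIGNAL appears in model_dir."""

  def __init__(self, model_dir, check_secs=30):
    self._signal_path = os.path.join(model_dir, 'OSS_STOP_SIGNAL')
    self._check_secs = check_secs
    self._last_check = 0

  def after_run(self, run_context, run_values):
    now = time.time()
    if now - self._last_check < self._check_secs:
      return
    self._last_check = now
    if gfile.Exists(self._signal_path):
      tf.logging.info('stop signal %s detected, stopping' % self._signal_path)
      run_context.request_stop()
```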
+ +import numpy as np +import os +import json +import time +import logging +import unittest +import traceback +import threading +import six + +import tensorflow as tf +from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.platform import gfile + +from easy_rec.python.inference.predictor import Predictor +from easy_rec.python.input.kafka_dataset import KafkaDataset +from easy_rec.python.utils import test_utils + +try: + import kafka + from kafka import KafkaProducer, KafkaAdminClient + from kafka.admin import NewTopic +except ImportError as ex: + logging.warning('kafka-python is not installed: %s' % traceback.format_exc(ex)) + + +class KafkaTest(tf.test.TestCase): + + def setUp(self): + self._success = True + self._test_dir = test_utils.get_tmp_dir() + if self._testMethodName == 'test_session': + self._kafka_server_proc = None + self._zookeeper_proc = None + return + + logging.info('Testing %s.%s, test_dir=%s' % (type(self).__name__, self._testMethodName, + self._test_dir)) + self._log_dir = os.path.join(self._test_dir, 'logs') + if not gfile.IsDirectory(self._log_dir): + gfile.MakeDirs(self._log_dir) + + self._kafka_servers = ['127.0.0.1:9092'] + self._test_topic = 'kafka_op_test_topic' + + if 'kafka_install_dir' in os.environ: + kafka_install_dir = os.environ.get('kafka_install_dir', None) + + zookeeper_config_raw = '%s/config/zookeeper.properties' % kafka_install_dir + zookeeper_config = os.path.join(self._test_dir, 'zookeeper.properties') + with open(zookeeper_config, 'w') as fout: + with open(zookeeper_config_raw, 'r') as fin: + for line_str in fin: + if line_str.startswith('dataDir='): + fout.write('dataDir=%s/zookeeper\n' % self._test_dir) + else: + fout.write(line_str) + cmd = 'bash %s/bin/zookeeper-server-start.sh %s' % ( + kafka_install_dir, zookeeper_config) + log_file = os.path.join(self._log_dir, 'zookeeper.log') + self._zookeeper_proc = test_utils.run_cmd(cmd, log_file) + + kafka_config_raw = '%s/config/server.properties' % kafka_install_dir + kafka_config = os.path.join(self._test_dir, 'server.properties') + with open(kafka_config, 'w') as fout: + with open(kafka_config_raw, 'r') as fin: + for line_str in fin: + if line_str.startswith('log.dirs='): + fout.write('log.dirs=%s/kafka\n' % self._test_dir) + else: + fout.write(line_str) + cmd = 'bash %s/bin/kafka-server-start.sh %s' % ( + kafka_install_dir, kafka_config) + log_file = os.path.join(self._log_dir, 'kafka_server.log') + self._kafka_server_proc = test_utils.run_cmd(cmd, log_file) + + started = False + while not started: + if self._kafka_server_proc.poll() and self._kafka_server_proc.returncode: + logging.warning('start kafka server failed, will retry.') + os.system('cat %s' % log_file) + self._kafka_server_proc = test_utils.run_cmd(cmd, log_file) + time.sleep(5) + else: + try: + admin_clt = KafkaAdminClient(bootstrap_servers=self._kafka_servers) + logging.info('old topics: %s' % (','.join(admin_clt.list_topics()))) + admin_clt.close() + started = True + except kafka.errors.NoBrokersAvailable: + time.sleep(2) + self._create_topic() + else: + self._zookeeper_proc = None + self._kafka_server_proc = None + self._should_stop = False + self._producer = None + + def _create_topic(self, num_partitions=2): + admin_clt = KafkaAdminClient(bootstrap_servers=self._kafka_servers) + + logging.info('create topic: %s' % self._test_topic) + topic_list = [NewTopic(name=self._test_topic, num_partitions=num_partitions, + replication_factor=1)] + admin_clt.create_topics(new_topics=topic_list, validate_only=False) + 
logging.info('all topics: %s' % (','.join(admin_clt.list_topics()))) + admin_clt.close() + + def _create_producer(self, generate_func): + # start produce thread + + prod = threading.Thread(target=generate_func) + prod.start() + return prod + + def _stop_producer(self): + if self._producer is not None: + self._should_stop = True + self._producer.join() + + def tearDown(self): + try: + self._stop_producer() + if self._kafka_server_proc is not None: + self._kafka_server_proc.terminate() + except Exception as ex: + logging.warning('exception terminate kafka proc: %s' % str(ex)) + + try: + if self._zookeeper_proc is not None: + self._zookeeper_proc.terminate() + except Exception as ex: + logging.warning('exception terminate zookeeper proc: %s' % str(ex)) + + test_utils.set_gpu_id(None) + if self._success: + test_utils.clean_up(self._test_dir) + + @unittest.skipIf('kafka_install_dir' not in os.environ, 'Only execute when kafka is available') + def test_kafka_ops(self): + try: + test_utils.set_gpu_id(None) + + def _generate(): + producer = KafkaProducer( + bootstrap_servers=self._kafka_servers, api_version=(0, 10, 1)) + i = 0 + while not self._should_stop: + msg = 'user_id_%d' % i + producer.send(self._test_topic, msg) + producer.close() + + self._producer = self._create_producer(_generate) + + group = 'dataset_consumer' + k = KafkaDataset( + servers=self._kafka_servers[0], + topics=[self._test_topic + ':0', self._test_topic + ':1'], + group=group, + eof=True, + # control the maximal read of each partition + config_global=['max.partition.fetch.bytes=1048576'], + message_key=True, + message_offset=True) + + batch_dataset = k.batch(5) + + iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types) + init_batch_op = iterator.make_initializer(batch_dataset) + get_next = iterator.get_next() + + sess = tf.Session() + sess.run(init_batch_op) + + p = sess.run(get_next) + + self.assertEquals(len(p), 3) + offset = p[2] + self.assertEquals(offset[0], '0:0') + self.assertEquals(offset[1], '0:1') + + p = sess.run(get_next) + offset = p[2] + self.assertEquals(offset[0], '0:5') + self.assertEquals(offset[1], '0:6') + + max_iter = 300 + while max_iter > 0: + sess.run(get_next) + max_iter -= 1 + except tf.errors.OutOfRangeError as ex: + pass + except Exception as ex: + self._success = False + raise ex + + @unittest.skipIf('kafka_install_dir' not in os.environ, 'Only execute when kafka is available') + def test_kafka_train(self): + try: + # start produce thread + def _generate(): + producer = KafkaProducer( + bootstrap_servers=self._kafka_servers, api_version=(0, 10, 1)) + while not self._should_stop: + with open('data/test/dwd_avazu_ctr_deepmodel_10w.csv', 'r') as fin: + for line_str in fin: + line_str = line_str.strip() + if self._should_stop: + break + if six.PY3: + line_str = line_str.encode('utf-8') + producer.send(self._test_topic, line_str) + producer.close() + logging.info('data generation thread done.') + + self._producer = self._create_producer(_generate) + + test_utils.set_gpu_id(None) + + self._success = test_utils.test_single_train_eval( + 'samples/model_config/deepfm_combo_avazu_kafka.config', self._test_dir) + + self.assertTrue(self._success) + except Exception as ex: + self._success = False + raise ex + + @unittest.skipIf('kafka_install_dir' not in os.environ, 'Only execute when kafka is available') + def test_kafka_train_v2(self): + try: + # start produce thread + def _generate(): + producer = KafkaProducer( + bootstrap_servers=self._kafka_servers, api_version=(0, 10, 1)) + while 
not self._should_stop: + with open('data/test/dwd_avazu_ctr_deepmodel_10w.csv', 'r') as fin: + for line_str in fin: + line_str = line_str.strip() + if self._should_stop: + break + if six.PY3: + line_str = line_str.encode('utf-8') + producer.send(self._test_topic, line_str) + producer.close() + logging.info('data generation thread done.') + + self._producer = self._create_producer(_generate) + + test_utils.set_gpu_id(None) + + self._success = test_utils.test_single_train_eval( + 'samples/model_config/deepfm_combo_avazu_kafka_time_offset.config', self._test_dir) + + self.assertTrue(self._success) + except Exception as ex: + self._success = False + raise ex + + @unittest.skipIf('kafka_install_dir' not in os.environ or 'oss_path' not in os.environ \ + or 'oss_endpoint' not in os.environ and 'oss_ak' not in os.environ \ + or 'oss_sk' not in os.environ, 'Only execute when kafka is available') + def test_kafka_processor(self): + self._success = False + success = test_utils.test_distributed_train_eval( + 'samples/model_config/taobao_fg_incr_save.config', self._test_dir) + self.assertTrue(success) + export_cmd = """ + python -m easy_rec.python.export --pipeline_config_path %s/pipeline.config + --export_dir %s/export/sep/ --oss_path=%s --oss_ak=%s --oss_sk=%s --oss_endpoint=%s + --asset_files ./samples/rtp_fg/fg.json + --checkpoint_path %s/train/model.ckpt-0 + """ % (self._test_dir, self._test_dir, os.environ['oss_path'], os.environ['oss_ak'], + os.environ['oss_sk'], os.environ['oss_endpoint'], self._test_dir) + proc = test_utils.run_cmd(export_cmd, '%s/log_export_sep.txt' % self._test_dir) + proc.wait() + self.assertTrue(proc.returncode == 0) + files = gfile.Glob(os.path.join(self._test_dir, 'export/sep/[1-9][0-9]*')) + export_sep_dir = files[0] + + predict_cmd = """ + python processor/test.py --saved_model_dir %s + --input_path data/test/rtp/taobao_test_feature.txt + --output_path %s/processor.out + --data_config processor/dataset.config + """ % (export_sep_dir, self._test_dir) + proc = test_utils.run_cmd(predict_cmd, '%s/log_processor.txt' % self._test_dir) + proc.wait() + self.assertTrue(proc.returncode == 0) + + with open('%s/processor.out' % self._test_dir, 'r') as fin: + processor_out = [] + for line_str in fin: + line_str = line_str.strip() + processor_out.append(json.loads(line_str)) + + predictor = Predictor(os.path.join(self._test_dir, 'train/export/final')) + with open('data/test/rtp/taobao_test_feature.txt', 'r') as fin: + inputs = [] + for line_str in fin: + line_str = line_str.strip() + line_tok = line_str.split(';')[-1] + line_tok = line_tok.split(chr(2)) + inputs.append(line_tok) + output_res = predictor.predict(inputs, batch_size=32) + + for i in range(len(output_res)): + val0 = output_res[i]['probs'] + val1 = processor_out[i]['probs'] + diff = np.abs(val0 - val1) + assert diff < 1e-4, 'too much difference[%.6f] >= 1e-4' % diff + self._success = True + +if __name__ == '__main__': + tf.test.main() diff --git a/easy_rec/python/test/odps_test_prepare.py b/easy_rec/python/test/odps_test_prepare.py index 775b6d1ce..de5294a0d 100644 --- a/easy_rec/python/test/odps_test_prepare.py +++ b/easy_rec/python/test/odps_test_prepare.py @@ -138,7 +138,7 @@ def put_data_to_bucket(odps_oss_config): odps_oss_config.oss_secret, odps_oss_config.endpoint, odps_oss_config.bucket_name) - for sub_dir in ['configs']: #, 'test_data']: + for sub_dir in ['configs']: for root, dirs, files in os.walk( os.path.join(odps_oss_config.temp_dir, sub_dir)): for one_file in files: diff --git 
a/easy_rec/python/test/odps_test_util.py b/easy_rec/python/test/odps_test_util.py index f9bb1965f..cc7f9781c 100644 --- a/easy_rec/python/test/odps_test_util.py +++ b/easy_rec/python/test/odps_test_util.py @@ -59,16 +59,16 @@ def __init__(self, script_path='./samples/odps_script'): self.odpscmd_path = os.environ.get('ODPS_CMD_PATH', 'odpscmd') self.odps_config_path = '' - # input table project name replace {ODPS_PROJ_NAME} in - # samples/odps_script: - # grep ODPS_PROJ_NAME -r samples/odps_script/ + self.project_name = '' self.dh_id = '' self.dh_key = '' - self.dh_endpoint = '' - self.dh_topic = '' - self.dh_project = '' + + self.dh_endpoint = 'https://dh-cn-beijing.aliyuncs.com' + self.dh_topic = 'easy_rec_test' + self.dh_project = 'easy_rec_test' + self.odps_endpoint = '' self.dh = None @@ -83,17 +83,6 @@ def __init__(self, script_path='./samples/odps_script'): # the difference are ossHost buckets arn settings self.is_outer = True - def load_dh_config(self, config_path): - import pdb - pdb.set_trace() - configer = configparser.ConfigParser() - configer.read(config_path, encoding='utf-8') - self.dh_id = configer.get('datahub', 'access_id') - self.dh_key = configer.get('datahub', 'access_key') - self.dh_endpoint = configer.get('datahub', 'endpoint') - self.dh_topic = configer.get('datahub', 'topic_name') - self.dh_project = configer.get('datahub', 'project') - def load_oss_config(self, config_path): with open(config_path, 'r') as fin: for line_str in fin: @@ -112,10 +101,18 @@ def load_odps_config(self, config_path): for line_str in fin: line_str = line_str.strip() line_str = line_str.replace(' ', '') - if line_str.startswith('project_name='): - self.project_name = line_str[len('project_name='):] - if line_str.startswith('end_point='): - self.odps_endpoint = line_str[len('end_point='):] + key_str = 'project_name=' + if line_str.startswith(key_str): + self.project_name = line_str[len(key_str):] + key_str = 'end_point=' + if line_str.startswith(key_str): + self.odps_endpoint = line_str[len(key_str):] + key_str = 'access_id=' + if line_str.startswith(key_str): + self.dh_id = line_str[len(key_str):] + key_str = 'access_key=' + if line_str.startswith(key_str): + self.dh_key = line_str[len(key_str):] def clean_topic(self, dh_project): if not dh_project: @@ -160,47 +157,44 @@ def init_dh_and_odps(self): self.odpsTable = 'deepfm_train_%s' % self.time_stamp self.clean_project() read_odps = DataFrame(self.odps.get_table(self.odpsTable)) - col = read_odps.schema.names + col_name = read_odps.schema.names col_type = [self.get_input_type(str(i)) for i in read_odps.schema.types] try: - self.dh.create_project(self.dh_project, 'EasyRecTest') + self.dh.create_project(self.dh_project, comment='EasyRecTest') logging.info('create project success!') except ResourceExistException: - logging.info('project %s already exist!' % self.dh_project) + logging.warning('project %s already exist!' % self.dh_project) except Exception as ex: - logging.info(traceback.format_exc(ex)) - record_schema = RecordSchema.from_lists(col, col_type) + logging.error(traceback.format_exc(ex)) + record_schema = RecordSchema.from_lists(col_name, col_type) try: # project_name, topic_name, shard_count, life_cycle, record_schema, comment self.dh.create_tuple_topic(self.dh_project, self.dh_topic, 7, 3, - record_schema, 'easyrec_datahub') - logging.info('create tuple topic success!') + record_schema, comment='EasyRecTest') + logging.info('create tuple topic %s success!' 
% self.dh_topic) except ResourceExistException: logging.info('topic %s already exist!' % self.dh_topic) except Exception as ex: - logging.error('exception:', ex) + logging.error('exception:%s' % str(ex)) logging.error(traceback.format_exc()) try: self.dh.wait_shards_ready(self.dh_project, self.dh_topic) - logging.info('shards all ready') + logging.info('datahub[%s,%s] shards all ready' % (self.dh_project, self.dh_topic)) topic_result = self.dh.get_topic(self.dh_project, self.dh_topic) if topic_result.record_type != RecordType.TUPLE: - logging.error('topic type illegal! ') + logging.error('invalid topic type: %s' % str(topic_result.record_type)) record_schema = topic_result.record_schema t = self.odps.get_table(self.odpsTable) with t.open_reader() as reader: - size = 0 record_list = [] - for data in reader[0:1000]: + for data in reader: record = TupleRecord(values=data.values, schema=record_schema) record_list.append(record) - if size % 1000: - self.dh.put_records(self.dh_project, self.dh_topic, record_list) - record_list = [] - size += 1 - except Exception as e: - logging.error(e) - + for i in range(10): + self.dh.put_records(self.dh_project, self.dh_topic, record_list) + except Exception as ex: + logging.error('exception: %s' % str(ex)) + logging.error(traceback.format_exc()) def get_oss_bucket(oss_key, oss_secret, endpoint, bucket_name): """Build oss2.Bucket instance. diff --git a/easy_rec/python/tools/predict_and_chk.py b/easy_rec/python/tools/predict_and_chk.py index 51fa945be..0adf2724f 100644 --- a/easy_rec/python/tools/predict_and_chk.py +++ b/easy_rec/python/tools/predict_and_chk.py @@ -4,11 +4,19 @@ import json import logging import sys +import os +import easy_rec import numpy as np from easy_rec.python.inference.predictor import Predictor +try: + import tensorflow as tf + tf.load_op_library(os.path.join(easy_rec.ops_dir, 'libembed_op.so')) +except Exception as ex: + logging.warning('exception: %s' % str(ex)) + logging.basicConfig( level=logging.INFO, format='[%(asctime)s][%(levelname)s] %(message)s') @@ -77,7 +85,7 @@ x for fid, x in enumerate(feature.split(args.separator)) if fid not in args.label_id ] - if len(predictor.input_names) == 1: + if 'features' in predictor.input_names: feature = args.separator.join(feature) batch_input.append(feature) output = predictor.predict(batch_input) diff --git a/easy_rec/python/tools/read_kafka.py b/easy_rec/python/tools/read_kafka.py new file mode 100644 index 000000000..27ccc35f0 --- /dev/null +++ b/easy_rec/python/tools/read_kafka.py @@ -0,0 +1,46 @@ +# -*- encoding:utf-8 -*- +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import sys
+import logging
+import argparse
+from kafka import KafkaProducer
+from kafka import KafkaConsumer, KafkaProducer, KafkaAdminClient
+from kafka.admin import NewTopic
+from kafka.structs import TopicPartition
+
+logging.basicConfig(
+    level=logging.INFO, format='[%(asctime)s][%(levelname)s] %(message)s')
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument('--servers', type=str, default='localhost:9092')
+  parser.add_argument('--topic', type=str, default=None)
+  parser.add_argument('--group', type=str, default='consumer')
+  parser.add_argument('--partitions', type=str, default=None)
+  parser.add_argument('--timeout', type=float, default=float('inf'))
+  args = parser.parse_args()
+
+  if args.topic is None:
+    logging.error('--topic is not set')
+    sys.exit(1)
+
+  servers = args.servers.split(',')
+  consumer = KafkaConsumer(group_id=args.group, bootstrap_servers=servers,
+                           consumer_timeout_ms=args.timeout * 1000)
+
+  if args.partitions is not None:
+    partitions = [ int(x) for x in args.partitions.split(',') ]
+  else:
+    partitions = consumer.partitions_for_topic(args.topic)
+  logging.info('partitions: %s' % partitions)
+
+  topics = [ TopicPartition(topic=args.topic, partition=part_id) \
+      for part_id in partitions ]
+  consumer.assign(topics)
+  consumer.seek_to_beginning()
+
+  record_id = 0
+  for x in consumer:
+    logging.info("%d: key=%s\toffset=%d\ttimestamp=%d\tlen=%d" % (record_id, x.key, x.offset,
+                 x.timestamp, len(x.value)))
+    record_id += 1
diff --git a/easy_rec/python/tools/write_kafka.py b/easy_rec/python/tools/write_kafka.py
new file mode 100644
index 000000000..8a2fbe2b2
--- /dev/null
+++ b/easy_rec/python/tools/write_kafka.py
@@ -0,0 +1,57 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
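The read_kafka.py tool above scans from the beginning and only stops via `--timeout`. An alternative, shown here purely as a sketch with placeholder server/topic values, is to capture the end offsets up front and stop once every assigned partition has been drained:

```python
# Sketch: bounded scan of a topic. Server/topic names are placeholders; empty
# partitions are handled by consumer_timeout_ms rather than by the offset check.
from kafka import KafkaConsumer
from kafka.structs import TopicPartition


def dump_topic(servers='localhost:9092', topic='kafka_op_test_topic'):
  consumer = KafkaConsumer(
      bootstrap_servers=servers.split(','), consumer_timeout_ms=10000)
  parts = [TopicPartition(topic, p) for p in consumer.partitions_for_topic(topic)]
  consumer.assign(parts)
  # partition -> offset just past the last message, recorded before the scan
  remain = {tp: off for tp, off in consumer.end_offsets(parts).items() if off > 0}
  consumer.seek_to_beginning(*parts)
  num_msg = 0
  for msg in consumer:
    num_msg += 1
    tp = TopicPartition(msg.topic, msg.partition)
    if tp in remain and msg.offset + 1 >= remain[tp]:
      del remain[tp]
    if not remain:
      break
  consumer.close()
  return num_msg
```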
+from kafka import KafkaProducer +from kafka import KafkaConsumer, KafkaProducer, KafkaAdminClient +from kafka.admin import NewTopic +from kafka.structs import TopicPartition +import time +import sys +import logging +import argparse + +logging.basicConfig( + level=logging.INFO, format='[%(asctime)s][%(levelname)s] %(message)s') + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--servers', type=str, default='localhost:9092') + parser.add_argument('--topic', type=str, default=None) + parser.add_argument('--group', type=str, default='consumer') + parser.add_argument('--partitions', type=str, default=None) + parser.add_argument('--timeout', type=float, default=float('inf')) + # file to send + parser.add_argument('--input_path', type=str, default=None) + args = parser.parse_args() + + if args.input_path is None: + logging.error('input_path is not set') + sys.exit(1) + + if args.topic is None: + logging.error('topic is not set') + sys.exit(1) + + servers = args.servers.split(',') + + admin_clt = KafkaAdminClient(bootstrap_servers=servers) + if args.topic not in admin_clt.list_topics(): + admin_clt.create_topics(new_topics=[NewTopic(name=args.topic, + num_partitions=1, replication_factor=1, + topic_configs={'max.message.bytes': 1024 * 1024 * 1024})], + validate_only=False) + logging.info('create increment save topic: %s' % args.topic) + admin_clt.close() + + producer = KafkaProducer( + bootstrap_servers=servers, + request_timeout_ms=args.timeout * 1000, + api_version=(0, 10, 1)) + + i = 1 + with open(args.input_path, 'r') as fin: + for line_str in fin: + producer.send(args.topic, line_str.encode('utf-8')) + i += 1 + if i % 100 == 0: + logging.info('progress: %d' % i) + producer.close() diff --git a/easy_rec/python/utils/constant.py b/easy_rec/python/utils/constant.py index 9df831a89..8caecaba8 100644 --- a/easy_rec/python/utils/constant.py +++ b/easy_rec/python/utils/constant.py @@ -2,3 +2,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. SAMPLE_WEIGHT = 'SAMPLE_WEIGHT' + +DENSE_UPDATE_VARIABLES = 'DENSE_UPDATE_VARIABLES' + +SPARSE_UPDATE_VARIABLES = 'SPARSE_UPDATE_VARIABLES' diff --git a/easy_rec/python/utils/embedding_utils.py b/easy_rec/python/utils/embedding_utils.py new file mode 100644 index 000000000..2e0497f5e --- /dev/null +++ b/easy_rec/python/utils/embedding_utils.py @@ -0,0 +1,45 @@ +# -*- encoding:utf-8 -*- +# Copyright (c) Alibaba, Inc. and its affiliates. +import tensorflow as tf +from easy_rec.python.utils import proto_util +from easy_rec.python.utils import constant +from tensorflow.python.framework import ops + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + + +def get_norm_name_to_ids(): + """Get normalize embedding name(including kv variables) to ids. + + Return: + normalized names to ids mapping. + """ + norm_name_to_ids = {} + for x in ops.get_collection(constant.SPARSE_UPDATE_VARIABLES): + norm_name, part_id = proto_util.get_norm_embed_name(x[0].name) + norm_name_to_ids[norm_name] = 1 + + for tid, t in enumerate(norm_name_to_ids.keys()): + norm_name_to_ids[t] = str(tid) + return norm_name_to_ids + +def get_sparse_name_to_ids(): + """Get embedding variable(including kv variables) name to ids mapping. + + Return: + variable names to ids mappping. 
+ """ + norm_name_to_ids = get_norm_name_to_ids() + name_to_ids = {} + for x in ops.get_collection(constant.SPARSE_UPDATE_VARIABLES): + norm_name, _ = proto_util.get_norm_embed_name(x[0].name) + name_to_ids[x[0].name] = norm_name_to_ids[norm_name] + return name_to_ids + +def get_dense_name_to_ids(): + dense_train_vars = ops.get_collection(constant.DENSE_UPDATE_VARIABLES) + norm_name_to_ids = {} + for tid, x in enumerate(dense_train_vars): + norm_name_to_ids[x.op.name] = tid + return norm_name_to_ids diff --git a/easy_rec/python/utils/estimator_utils.py b/easy_rec/python/utils/estimator_utils.py index e406d1c73..950118ff8 100644 --- a/easy_rec/python/utils/estimator_utils.py +++ b/easy_rec/python/utils/estimator_utils.py @@ -13,12 +13,30 @@ import numpy as np import six +import threading import tensorflow as tf +from tensorflow.python.framework import ops +from easy_rec.python.ops.incr_record import get_sparse_indices +from tensorflow.python.ops import array_ops from tensorflow.core.framework.summary_pb2 import Summary from tensorflow.python.framework import meta_graph from tensorflow.python.training.summary_io import SummaryWriterCache +from tensorflow.python.training.basic_session_run_hooks import SecondOrStepTimer +from tensorflow.python.training import training_util +from tensorflow.python.platform import gfile +from tensorflow.python.framework import errors_impl from easy_rec.python.utils import shape_utils +from easy_rec.python.utils import embedding_utils +from easy_rec.python.utils import constant + +try: + import kafka + from kafka import KafkaProducer, KafkaAdminClient + from kafka.admin import NewTopic +except ImportError as ex: + logging.warning('kafka-python is not installed: %s' % str(ex)) + if tf.__version__ >= '2.0': tf = tf.compat.v1 @@ -111,10 +129,10 @@ def _check_flag_file(is_chief, flag_file): logging.info('_check_flag_file: is_chief = %d flag_file=%s' % (is_chief, flag_file)) if is_chief: - with tf.gfile.GFile(flag_file, 'w') as fout: + with gfile.GFile(flag_file, 'w') as fout: fout.write('atexit time: %d' % int(time.time())) else: - while not tf.gfile.Exists(flag_file): + while not gfile.Exists(flag_file): time.sleep(1) from atexit import register @@ -208,10 +226,10 @@ def _check_flag_file(is_chief, flag_file): logging.info('_check_flag_file: is_chief = %d flag_file=%s' % (is_chief, flag_file)) if is_chief: - with tf.gfile.GFile(flag_file, 'w') as fout: + with gfile.GFile(flag_file, 'w') as fout: fout.write('atexit time: %d' % int(time.time())) else: - while not tf.gfile.Exists(flag_file): + while not gfile.Exists(flag_file): time.sleep(1) from atexit import register @@ -235,7 +253,7 @@ def __init__(self, num_steps, filename, is_chief): self._num_steps = num_steps self._is_chief = is_chief if self._is_chief: - self._progress_file = tf.gfile.GFile(filename, 'w') + self._progress_file = gfile.GFile(filename, 'w') self._progress_file.write('0.00\n') self._progress_interval = 0.01 # 1% self._last_progress_cnt = 0 @@ -276,7 +294,9 @@ def __init__(self, checkpoint_basename='model.ckpt', scaffold=None, listeners=None, - write_graph=True): + write_graph=True, + data_offset_var=None, + increment_save_config=None): """Initializes a `CheckpointSaverHook`. Args: @@ -290,6 +310,8 @@ def __init__(self, Used for callbacks that run immediately before or after this hook saves the checkpoint. write_graph: whether to save graph.pbtxt. + data_offset_var: data offset variable. + increment_save_config: parameters for saving increment checkpoints. 
Raises: ValueError: One of `save_steps` or `save_secs` should be set. @@ -304,6 +326,61 @@ def __init__(self, scaffold=scaffold, listeners=listeners) self._write_graph = write_graph + self._data_offset_var = data_offset_var + + if increment_save_config is not None: + self._dense_name_to_ids = embedding_utils.get_dense_name_to_ids() + self._sparse_name_to_ids = embedding_utils.get_sparse_name_to_ids() + + with gfile.GFile(os.path.join(checkpoint_dir, constant.DENSE_UPDATE_VARIABLES), + 'w') as fout: + json.dump(self._dense_name_to_ids, fout, indent=2) + + save_secs = increment_save_config.dense_save_secs + save_steps = increment_save_config.dense_save_steps + self._dense_timer = SecondOrStepTimer(every_secs=save_secs if save_secs > 0 else None, + every_steps=save_steps if save_steps > 0 else None) + save_secs = increment_save_config.sparse_save_secs + save_steps = increment_save_config.sparse_save_steps + self._sparse_timer = SecondOrStepTimer(every_secs=save_secs if save_secs > 0 else None, + every_steps=save_steps if save_steps > 0 else None) + + self._dense_timer.update_last_triggered_step(0) + self._sparse_timer.update_last_triggered_step(0) + + self._sparse_indices = [] + self._sparse_values = [] + sparse_train_vars = ops.get_collection(constant.SPARSE_UPDATE_VARIABLES) + for sparse_var, indice_dtype in sparse_train_vars: + with ops.control_dependencies([tf.train.get_global_step()]): + with ops.colocate_with(sparse_var): + sparse_indice = get_sparse_indices(var_name=sparse_var.op.name, ktype=indice_dtype) + sparse_indice = sparse_indice.global_indices + self._sparse_indices.append(sparse_indice) + if 'EmbeddingVariable' in str(type(sparse_var)): + self._sparse_values.append(sparse_var.sparse_read(sparse_indice)) + else: + self._sparse_values.append(array_ops.gather(sparse_var, sparse_indice)) + if increment_save_config.HasField('kafka'): + self._topic = increment_save_config.kafka.topic + logging.info('increment save topic: %s' % self._topic) + + admin_clt = KafkaAdminClient(bootstrap_servers=increment_save_config.kafka.server) + if self._topic not in admin_clt.list_topics(): + admin_clt.create_topics(new_topics=[NewTopic(name=self._topic, + num_partitions=1, replication_factor=1, + topic_configs={'max.message.bytes':1024 * 1024 * 1024})], validate_only=False) + logging.info('create increment save topic: %s' % self._topic) + admin_clt.close() + + servers = increment_save_config.kafka.server.split(',') + self._kafka_producer = KafkaProducer(bootstrap_servers=servers, + max_request_size=1024 * 1024 * 64) + else: + self._kafka_producer = None + else: + self._dense_timer = None + self._sparse_timer = None def after_create_session(self, session, coord): global_step = session.run(self._global_step_tensor) @@ -319,16 +396,93 @@ def after_create_session(self, session, coord): graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def) self._summary_writer.add_graph(graph) self._summary_writer.add_meta_graph(meta_graph_def) - # when tf version > 1.10.0, we use defaut training strategy, which saves ckpt - # at first train step - if LooseVersion(tf.__version__) >= LooseVersion('1.10.0'): - # The checkpoint saved here is the state at step "global_step". 
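The dense/sparse send schedule configured above is driven by `SecondOrStepTimer`. A minimal standalone sketch of its triggering semantics, using the same TF 1.x import path as this file:

```python
# Sketch: SecondOrStepTimer fires once the step delta (or, when every_secs is
# set, the wall-clock delta) since the last triggered step has been reached.
from tensorflow.python.training.basic_session_run_hooks import SecondOrStepTimer

timer = SecondOrStepTimer(every_secs=None, every_steps=10)
timer.update_last_triggered_step(0)
assert not timer.should_trigger_for_step(5)    # only 5 steps elapsed
assert timer.should_trigger_for_step(10)       # 10 steps elapsed -> trigger
timer.update_last_triggered_step(10)
assert not timer.should_trigger_for_step(15)   # next trigger at step 20
```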
- self._save(session, global_step) + + # save for step 0 + self._save(session, global_step) + self._timer.update_last_triggered_step(global_step) def before_run(self, run_context): # pylint: disable=unused-argument return tf.train.SessionRunArgs(self._global_step_tensor) + def _send_dense(self, global_step, session): + dense_train_vars = ops.get_collection(constant.DENSE_UPDATE_VARIABLES) + dense_train_vals = session.run(dense_train_vars) + logging.info("global_step=%d, increment save dense variables" % global_step) + + msg_num = len(dense_train_vals) + msg_ids = [ self._dense_name_to_ids[x.op.name] for x in dense_train_vars] + # 0 mean dense update message + msg_header = [0, msg_num, global_step] + for msg_id, x in zip(msg_ids, dense_train_vals): + msg_header.append(msg_id) + msg_header.append(x.size) + bytes_buf = np.array(msg_header, dtype=np.int32).tobytes() + for x in dense_train_vals: + bytes_buf += x.tobytes() + if self._kafka_producer is not None: + msg_key = 'dense_update_%d' % global_step + send_res = self._kafka_producer.send(self._topic, bytes_buf, key=msg_key.encode('utf-8')) + logging.info('kafka send dense: %d exception: %s' % (global_step, send_res.exception)) + logging.info("global_step=%d, increment update dense variables, msg_num=%d" \ + % (global_step, msg_num)) + + def _send_sparse(self, global_step, session): + sparse_train_vars = ops.get_collection(constant.SPARSE_UPDATE_VARIABLES) + sparse_res = session.run(self._sparse_indices + self._sparse_values) + msg_num = int(len(sparse_res) / 2) + + sel_ids = [ i for i in range(msg_num) if len(sparse_res[i]) > 0 ] + sparse_key_res = [ sparse_res[i] for i in sel_ids ] + sparse_val_res = [ sparse_res[i+msg_num] for i in sel_ids ] + sparse_train_vars = [ sparse_train_vars[i][0] for i in sel_ids ] + + embed_ids = [ self._sparse_name_to_ids[x.name] for x in sparse_train_vars] + + msg_num = len(sel_ids) + + if msg_num == 0: + logging.warning('there are no sparse updates, will skip this send: %d' % global_step) + return + + + # 1 means sparse update messages + msg_header = [1, msg_num, global_step] + for i, x in enumerate(embed_ids): + msg_header.append(x) + msg_header.append(len(sparse_res[sel_ids[i]])) + bytes_buf = np.array(msg_header, dtype=np.int32).tobytes() + for tmp_id, tmp_key, tmp_val, tmp_var in zip(embed_ids, sparse_key_res, + sparse_val_res, sparse_train_vars): + # for non kv embedding variables, add partition offset to tmp_key + if 'EmbeddingVariable' not in str(type(tmp_var)): + if tmp_var._save_slice_info is not None: + tmp_key += tmp_var._save_slice_info.var_offset[0] + bytes_buf += tmp_key.tobytes() + bytes_buf += tmp_val.tobytes() + if self._kafka_producer is not None: + msg_key = 'sparse_update_%d' % global_step + send_res = self._kafka_producer.send(self._topic, bytes_buf, key=msg_key.encode('utf-8')) + logging.info('kafka send sparse: %d %s' % (global_step, send_res.exception)) + logging.info("global_step=%d, increment update sparse variables, msg_num=%d, msg_size=%d" \ + % (global_step, msg_num, len(bytes_buf))) + + def after_run(self, run_context, run_values): + super(CheckpointSaverHook, self).after_run(run_context, run_values) + stale_global_step = run_values.results + global_step = -1 + if self._dense_timer is not None and self._dense_timer.should_trigger_for_step(stale_global_step + self._steps_per_run): + global_step = run_context.session.run(self._global_step_tensor) + self._dense_timer.update_last_triggered_step(global_step) + self._send_dense(global_step, run_context.session) + + if 
self._sparse_timer is not None and self._sparse_timer.should_trigger_for_step(stale_global_step + self._steps_per_run): + if global_step < 0: + global_step = run_context.session.run(self._global_step_tensor) + + self._sparse_timer.update_last_triggered_step(global_step) + self._send_sparse(global_step, run_context.session) + def _save(self, session, step): """Saves the latest checkpoint, returns should_stop.""" logging.info('Saving checkpoints for %d into %s.', step, self._save_path) @@ -343,6 +497,16 @@ def _save(self, session, step): write_meta_graph=self._write_graph) save_dir, save_name = os.path.split(self._save_path) + if self._data_offset_var is not None: + save_data_offset = session.run(self._data_offset_var) + data_offset_json = {} + for x in save_data_offset: + if x : + data_offset_json.update(json.loads(x)) + save_offset_path = os.path.join(save_dir, 'model.ckpt-%d.offset' % step) + with gfile.GFile(save_offset_path, 'w') as fout: + json.dump(data_offset_json, fout) + self._summary_writer.add_session_log( tf.SessionLog( status=tf.SessionLog.CHECKPOINT, checkpoint_path=self._save_path), @@ -357,6 +521,18 @@ def _save(self, session, step): should_stop = True return should_stop + def end(self, session): + super(CheckpointSaverHook, self).end(session) + global_step = session.run(self._global_step_tensor) + if self._dense_timer is not None and \ + global_step != self._dense_timer.last_triggered_step(): + self._dense_timer.update_last_triggered_step(global_step) + self._send_dense(global_step, session) + if self._sparse_timer is not None and \ + global_step != self._sparse_timer.last_triggered_step(): + self._sparse_timer.update_last_triggered_step(global_step) + self._send_sparse(global_step, session) + class NumpyCheckpointRestoreHook(SessionRunHook): """Restore variable from numpy checkpoint.""" @@ -395,7 +571,7 @@ def begin(self): vars_not_inited[var_name] = ','.join([str(s) for s in var_shape]) self._restore_op = tf.group(assign_ops) - with tf.gfile.GFile(self._ckpt_path[:-4] + '_not_inited.txt', 'w') as f: + with gfile.GFile(self._ckpt_path[:-4] + '_not_inited.txt', 'w') as f: for var_name in sorted(vars_not_inited.keys()): f.write('%s:%s\n' % (var_name, vars_not_inited[var_name])) assert not has_shape_unmatch, 'exist variable shape not match, restore failed' @@ -491,6 +667,40 @@ def after_create_session(self, session, coord): logging.info('restore checkpoint from %s' % ckpt_path) saver.restore(session, ckpt_path) +class OssStopSignalHook(SessionRunHook): + def __init__(self, model_dir, secs_interval=60, step_interval=10): + self._stop_sig_file = os.path.join(model_dir, 'OSS_STOP_SIGNAL') + self._stop = False + self._check_stop = False + self._last_chk_step = 0 + self._curr_step = 0 + def _check_stop(): + while self._check_stop: + if self._curr_step < self._last_chk_step + step_interval: + time.sleep(1) + continue + self._last_chk_step = self._curr_step + if gfile.Exists(self._stop_sig_file): + self._stop = True + logging.info('OssStopSignalHook: stop on signal %s' % self._stop_sig_file) + break + time.sleep(secs_interval) + self._th = threading.Thread(target=_check_stop) + self._th.start() + + def before_run(self, run_context): + if self._stop: + run_context.request_stop() + self._global_step_tensor = training_util._get_or_create_global_step_read() + return tf.train.SessionRunArgs(self._global_step_tensor) + + def after_run(self, run_context, run_values): + self._curr_step = run_values.results + + def end(self, session): + self._check_stop = True + self._th.join() + class 
OnlineEvaluationHook(SessionRunHook): @@ -516,7 +726,7 @@ def end(self, session): eval_result_file = os.path.join(self._output_dir, 'online_eval_result.txt-%s' % global_step) logging.info('Saving online eval result to file %s' % eval_result_file) - with tf.gfile.GFile(eval_result_file, 'w') as ofile: + with gfile.GFile(eval_result_file, 'w') as ofile: result_to_write = {} for key in sorted(metric_value_dict): # convert numpy float to python float @@ -580,7 +790,11 @@ def latest_checkpoint(model_dir): Return: model_path: xx/model.ckpt-2000 """ - ckpt_metas = tf.gfile.Glob(os.path.join(model_dir, 'model.ckpt-*.meta')) + try: + ckpt_metas = gfile.Glob(os.path.join(model_dir, 'model.ckpt-*.meta')) + except errors_impl.NotFoundError as ex: + return None + if len(ckpt_metas) == 0: return None diff --git a/easy_rec/python/utils/export_big_model.py b/easy_rec/python/utils/export_big_model.py index 248d6d021..e89b77043 100644 --- a/easy_rec/python/utils/export_big_model.py +++ b/easy_rec/python/utils/export_big_model.py @@ -6,6 +6,7 @@ import time import numpy as np +from google.protobuf import json_format import tensorflow as tf from tensorflow.core.protobuf import config_pb2 from tensorflow.python.framework import ops @@ -18,12 +19,15 @@ from tensorflow.python.training.device_setter import replica_device_setter from tensorflow.python.training.monitored_session import ChiefSessionCreator from tensorflow.python.training.saver import export_meta_graph +from tensorflow.python.training.monitored_session import Scaffold import easy_rec +import json from easy_rec.python.utils import estimator_utils from easy_rec.python.utils import io_util from easy_rec.python.utils import proto_util -from easy_rec.python.utils.meta_graph_editor import MetaGraphEditor +from easy_rec.python.utils import constant +from easy_rec.python.utils.meta_graph_editor import MetaGraphEditor, EMBEDDING_INITIALIZERS if tf.__version__ >= '2.0': from tensorflow.python.framework.ops import disable_eager_execution @@ -33,6 +37,9 @@ GPUOptions = config_pb2.GPUOptions +INCR_UPDATE_SIGNATURE_KEY = 'incr_update_sig' + + def export_big_model(export_dir, pipeline_config, redis_params, serving_input_fn, estimator, checkpoint_path, verbose): for key in redis_params: @@ -282,6 +289,7 @@ def export_big_model(export_dir, pipeline_config, redis_params, saver = tf.train.Saver() with tf.Session(target=server.target if server else '') as sess: saver.restore(sess, checkpoint_path) + builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map={ @@ -298,7 +306,7 @@ def export_big_model(export_dir, pipeline_config, redis_params, return -def export_big_model_to_oss(export_dir, pipeline_config, oss_params, +def export_big_model_to_oss(export_dir, pipeline_config, oss_params, serving_input_fn, estimator, checkpoint_path, verbose): for key in oss_params: @@ -489,6 +497,7 @@ def export_big_model_to_oss(export_dir, pipeline_config, oss_params, oss_timeout=oss_params.get('oss_timeout', 1500), meta_graph_def=meta_graph_def, norm_name_to_ids=norm_name_to_ids, + incr_update_params=oss_params.get('incr_save', None), debug_dir=export_dir if verbose else '') meta_graph_editor.edit_graph_for_oss() tf.reset_default_graph() @@ -500,11 +509,24 @@ def export_big_model_to_oss(export_dir, pipeline_config, oss_params, with GFile(embed_name_to_id_file, 'w') as fout: for tmp_norm_name in norm_name_to_ids: fout.write('%s\t%s\n' % (tmp_norm_name, norm_name_to_ids[tmp_norm_name])) - tf.add_to_collection( - tf.GraphKeys.ASSET_FILEPATHS, + 
ops.add_to_collection( + ops.GraphKeys.ASSET_FILEPATHS, tf.constant( embed_name_to_id_file, dtype=tf.string, name='embed_name_to_ids.txt')) + dense_train_vars_path = os.path.join(os.path.dirname(checkpoint_path), constant.DENSE_UPDATE_VARIABLES) + ops.add_to_collection( + ops.GraphKeys.ASSET_FILEPATHS, + tf.constant( + dense_train_vars_path, dtype=tf.string, name=constant.DENSE_UPDATE_VARIABLES)) + + kafka_params_file = os.path.join(export_dir, "kafka.txt") + with GFile(kafka_params_file, 'w') as fout: + json.dump(json.loads(json_format.MessageToJson(oss_params['incr_save']['kafka'], + preserving_proto_field_name=True)), fout, indent=2) + ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, + tf.constant(kafka_params_file, dtype=tf.string, name="kafka.txt")) + export_dir = os.path.join(export_dir, meta_graph_def.meta_info_def.meta_graph_version) export_dir = io_util.fix_oss_dir(export_dir) @@ -518,6 +540,7 @@ def export_big_model_to_oss(export_dir, pipeline_config, oss_params, tmp = graph.get_tensor_by_name(inputs[tmp_key].name) tensor_info_inputs[tmp_key] = \ tf.saved_model.utils.build_tensor_info(tmp) + tensor_info_outputs = {} for tmp_key in outputs: tmp = graph.get_tensor_by_name(outputs[tmp_key].name) @@ -529,19 +552,43 @@ def export_big_model_to_oss(export_dir, pipeline_config, oss_params, outputs=tensor_info_outputs, method_name=signature_constants.PREDICT_METHOD_NAME)) + incr_update_inputs = meta_graph_editor.sparse_update_inputs + incr_update_outputs = meta_graph_editor.sparse_update_outputs + incr_update_inputs.update(meta_graph_editor.dense_update_inputs) + incr_update_outputs.update(meta_graph_editor.dense_update_outputs) + tensor_info_incr_update_inputs = {} + tensor_info_incr_update_outputs = {} + for tmp_key in incr_update_inputs: + tmp = graph.get_tensor_by_name(incr_update_inputs[tmp_key].name) + tensor_info_incr_update_inputs[tmp_key] = \ + tf.saved_model.utils.build_tensor_info(tmp) + for tmp_key in incr_update_outputs: + tmp = graph.get_tensor_by_name(incr_update_outputs[tmp_key].name) + tensor_info_incr_update_outputs[tmp_key] = \ + tf.saved_model.utils.build_tensor_info(tmp) + incr_update_signature = ( + tf.saved_model.signature_def_utils.build_signature_def( + inputs=tensor_info_incr_update_inputs, + outputs=tensor_info_incr_update_outputs, + method_name=signature_constants.PREDICT_METHOD_NAME)) + session_config = ConfigProto( allow_soft_placement=True, log_device_placement=True) saver = tf.train.Saver() with tf.Session(target=server.target if server else '') as sess: saver.restore(sess, checkpoint_path) + main_op = tf.group([Scaffold.default_local_init_op(), + ops.get_collection(EMBEDDING_INITIALIZERS)]) builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map={ signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature, + INCR_UPDATE_SIGNATURE_KEY: incr_update_signature }, assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS), saver=saver, + main_op=main_op, strip_default_attrs=True, clear_devices=True) builder.save() diff --git a/easy_rec/python/utils/meta_graph_editor.py b/easy_rec/python/utils/meta_graph_editor.py index d01c6fb8e..fb5194882 100644 --- a/easy_rec/python/utils/meta_graph_editor.py +++ b/easy_rec/python/utils/meta_graph_editor.py @@ -7,8 +7,14 @@ from tensorflow.python.platform.gfile import GFile from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model.loader_impl import SavedModelLoader +from tensorflow.python.saved_model import constants +from 
tensorflow.python.framework import ops from easy_rec.python.utils import proto_util +from easy_rec.python.utils import embedding_utils +from easy_rec.python.utils import constant + +EMBEDDING_INITIALIZERS = 'embedding_initializers' class MetaGraphEditor: @@ -27,6 +33,7 @@ def __init__(self, oss_timeout=0, meta_graph_def=None, norm_name_to_ids=None, + incr_update_params=None, debug_dir=''): self._lookup_op = tf.load_op_library(lookup_lib_path) self._debug_dir = debug_dir @@ -73,6 +80,37 @@ def __init__(self, self._oss_ak = oss_ak self._oss_sk = oss_sk self._oss_timeout = oss_timeout + + self._kafka_params = None + if incr_update_params is not None and 'kafka' in incr_update_params: + self._kafka_params = incr_update_params['kafka'] + + self._datahub_params = None + if incr_update_params is not None and 'datahub' in incr_update_params: + self._datahub_params = incr_update_params['datahub'] + + # increment update placeholders + self._embedding_update_inputs = {} + self._embedding_update_outputs = {} + + self._dense_update_inputs = {} + self._dense_update_outputs = {} + + @property + def sparse_update_inputs(self): + return self._embedding_update_inputs + + @property + def sparse_update_outputs(self): + return self._embedding_update_outputs + + @property + def dense_update_inputs(self): + return self._dense_update_inputs + + @property + def dense_update_outputs(self): + return self._dense_update_outputs @property def graph_def(self): @@ -378,7 +416,46 @@ def add_oss_lookup_op(self, lookup_input_indices, lookup_input_values, combiners=self._embed_combiners, embedding_dims=self._embed_dims, embedding_names=self._embed_ids, - embedding_is_kv=self._embed_is_kv) + embedding_is_kv=self._embed_is_kv, + shared_name='embedding_lookup_res', + name='embedding_lookup_fused/lookup') + + lookup_init_op = self._lookup_op.oss_init( + osspath=self._oss_path, + endpoint=self._oss_endpoint, + ak=self._oss_ak, + sk=self._oss_sk, + combiners=self._embed_combiners, + embedding_dims=self._embed_dims, + embedding_names=self._embed_ids, + embedding_is_kv=self._embed_is_kv, + N=len(self._embed_is_kv), + shared_name='embedding_lookup_res', + name='embedding_lookup_fused/init') + + ops.add_to_collection(EMBEDDING_INITIALIZERS, lookup_init_op) + + if self._kafka_params: + # all sparse variables are updated by a single custom operation + message_ph = tf.placeholder(tf.int8, [None], name='incr_update/message') + embedding_update = self._lookup_op.embedding_update( + message=message_ph, + shared_name='embedding_lookup_res', + name='embedding_lookup_fused/embedding_update') + self._embedding_update_inputs['incr_update/sparse/message'] = message_ph + self._embedding_update_outputs['incr_update/sparse/embedding_update'] = embedding_update + + # dense variables are updated one by one + dense_name_to_ids = embedding_utils.get_dense_name_to_ids() + for x in ops.get_collection(constant.DENSE_UPDATE_VARIABLES): + dense_var_id = dense_name_to_ids[x.op.name] + dense_input_name = 'incr_update/dense/%d/input' % dense_var_id + dense_output_name = 'incr_update/dense/%d/output' % dense_var_id + dense_update_input = tf.placeholder(tf.float32, x.get_shape(), + name=dense_input_name) + self._dense_update_inputs[dense_input_name] = dense_update_input + dense_assign_op = tf.assign(x, dense_update_input) + self._dense_update_outputs[dense_output_name] = dense_assign_op meta_graph_def = tf.train.export_meta_graph() diff --git a/easy_rec/python/utils/multi_optimizer.py b/easy_rec/python/utils/multi_optimizer.py index 9e5cefbda..c34c4abe0 100644 
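For reference, a consumer of the incremental-update topic could unpack the dense messages written by `_send_dense` above. This is a rough sketch under two assumptions: the int32 header layout shown in `_send_dense`, and float32 dense values; the function name is mine, not part of the patch.

```python
# Sketch: parse one dense update message. Header (int32):
# [0, msg_num, global_step, var_id_0, size_0, ..., var_id_{n-1}, size_{n-1}],
# followed by the concatenated variable values, assumed here to be float32.
import numpy as np


def parse_dense_update(buf):
  msg_type, msg_num, global_step = np.frombuffer(buf, dtype=np.int32, count=3)
  assert msg_type == 0, 'not a dense update message'
  pairs = np.frombuffer(
      buf, dtype=np.int32, count=2 * int(msg_num), offset=12).reshape(-1, 2)
  offset = 12 + 8 * int(msg_num)
  updates = {}
  for var_id, size in pairs:
    updates[int(var_id)] = np.frombuffer(
        buf, dtype=np.float32, count=int(size), offset=offset)
    offset += 4 * int(size)
  return int(global_step), updates
```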
--- a/easy_rec/python/utils/multi_optimizer.py +++ b/easy_rec/python/utils/multi_optimizer.py @@ -38,6 +38,9 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): update_ops.append(opt.apply_gradients(tmp, None)) return tf.group(update_ops) + def open_auto_record(self, flag=True): + super(MultiOptimizer, self).open_auto_record(flag) + def get_slot(self, var, name): raise NotImplementedError('not implemented') # for opt in self._opts: diff --git a/easy_rec/python/utils/test_utils.py b/easy_rec/python/utils/test_utils.py index 104f4faa0..74a8158c7 100644 --- a/easy_rec/python/utils/test_utils.py +++ b/easy_rec/python/utils/test_utils.py @@ -150,9 +150,9 @@ def _load_config_for_test(pipeline_config_path, test_dir, total_steps=50): def test_datahub_train_eval(pipeline_config_path, + odps_oss_config, test_dir, process_pipeline_func=None, - hyperparam_str='', total_steps=50, post_check_func=None): gpus = get_available_gpus() @@ -175,13 +175,26 @@ def test_datahub_train_eval(pipeline_config_path, pipeline_config.train_config.train_distribute = 0 pipeline_config.train_config.num_gpus_per_worker = 1 pipeline_config.train_config.sync_replicas = False + + pipeline_config.datahub_train_input.akId = odps_oss_config.dh_id + pipeline_config.datahub_train_input.akSecret = odps_oss_config.dh_key + pipeline_config.datahub_train_input.region = odps_oss_config.dh_endpoint + pipeline_config.datahub_train_input.project = odps_oss_config.dh_project + pipeline_config.datahub_train_input.topic = odps_oss_config.dh_topic + + pipeline_config.datahub_eval_input.akId = odps_oss_config.dh_id + pipeline_config.datahub_eval_input.akSecret = odps_oss_config.dh_key + pipeline_config.datahub_eval_input.region = odps_oss_config.dh_endpoint + pipeline_config.datahub_eval_input.project = odps_oss_config.dh_project + pipeline_config.datahub_eval_input.topic = odps_oss_config.dh_topic + if process_pipeline_func is not None: assert callable(process_pipeline_func) pipeline_config = process_pipeline_func(pipeline_config) config_util.save_pipeline_config(pipeline_config, test_dir) test_pipeline_config_path = os.path.join(test_dir, 'pipeline.config') - train_cmd = 'python3 -m easy_rec.python.train_eval --pipeline_config_path %s %s' % ( - test_pipeline_config_path, hyperparam_str) + train_cmd = 'python -m easy_rec.python.train_eval --pipeline_config_path %s' % \ + test_pipeline_config_path proc = run_cmd(train_cmd, '%s/log_%s.txt' % (test_dir, 'master')) proc.wait() if proc.returncode != 0: diff --git a/pai_jobs/deploy.sh b/pai_jobs/deploy.sh index e1f10f6f5..385669119 100755 --- a/pai_jobs/deploy.sh +++ b/pai_jobs/deploy.sh @@ -92,6 +92,16 @@ fi cp easy_rec/__init__.py easy_rec/__init__.py.bak sed -i -e "s/\[VERSION\]/$VERSION/g" easy_rec/__init__.py find -L easy_rec -name "*.pyc" | xargs rm -rf + +if [ ! -d "datahub" ] +then + wget http://easyrec.oss-cn-beijing.aliyuncs.com/third_party/pydatahub.tar.gz + if [ $? -ne 0 ] + then + echo "datahub download failed." + fi + tar -zvxf pydatahub.tar.gz +fi tar -cvzhf $RES_PATH easy_rec run.py mv easy_rec/__init__.py.bak easy_rec/__init__.py diff --git a/pai_jobs/deploy_ext.sh b/pai_jobs/deploy_ext.sh index 8426596d5..142e45880 100755 --- a/pai_jobs/deploy_ext.sh +++ b/pai_jobs/deploy_ext.sh @@ -92,7 +92,19 @@ fi cp -R $root_dir/easy_rec ./easy_rec sed -i -e "s/\[VERSION\]/$VERSION/g" easy_rec/__init__.py find -L easy_rec -name "*.pyc" | xargs rm -rf -tar -cvzhf $RES_PATH easy_rec run.py + +if [ ! 
-d "datahub" ] +then + wget http://easyrec.oss-cn-beijing.aliyuncs.com/third_party/pydatahub.tar.gz + if [ $? -ne 0 ] + then + echo "datahub download failed." + fi + tar -zvxf pydatahub.tar.gz + rm -rf pydatahub.tar.gz +fi + +tar -cvzhf $RES_PATH easy_rec datahub lz4 cprotobuf run.py # 2 means generate only if [ $mode -ne 2 ] diff --git a/pai_jobs/easy_rec_flow/easy_rec.xml b/pai_jobs/easy_rec_flow/easy_rec.xml index d9dc17ddc..4c04c52b0 100644 --- a/pai_jobs/easy_rec_flow/easy_rec.xml +++ b/pai_jobs/easy_rec_flow/easy_rec.xml @@ -56,7 +56,7 @@ - + diff --git a/pai_jobs/run.py b/pai_jobs/run.py index e9f768149..c273ea64b 100644 --- a/pai_jobs/run.py +++ b/pai_jobs/run.py @@ -165,6 +165,8 @@ 'hyperparameter save metric path') tf.app.flags.DEFINE_string('asset_files', None, 'extra files to add to export') +tf.app.flags.DEFINE_bool('online', False, 'for online training') + FLAGS = tf.app.flags.FLAGS @@ -238,18 +240,21 @@ def main(argv): ) >= 2, 'at least 2 tables must be specified, but only[%d]: %s' % ( len(tables), FLAGS.tables) - if FLAGS.train_tables: - pipeline_config.train_input_path = FLAGS.train_tables - else: - pipeline_config.train_input_path = FLAGS.tables.split(',')[0] + if not FLAGS.online: + if FLAGS.train_tables: + pipeline_config.train_input_path = FLAGS.train_tables + else: + pipeline_config.train_input_path = FLAGS.tables.split(',')[0] - if FLAGS.eval_tables: - pipeline_config.eval_input_path = FLAGS.eval_tables - else: - pipeline_config.eval_input_path = FLAGS.tables.split(',')[1] + if FLAGS.eval_tables: + pipeline_config.eval_input_path = FLAGS.eval_tables + else: + pipeline_config.eval_input_path = FLAGS.tables.split(',')[1] - print('[run.py] train_tables: %s' % pipeline_config.train_input_path) - print('[run.py] eval_tables: %s' % pipeline_config.eval_input_path) + print('[run.py] train_tables: %s' % pipeline_config.train_input_path) + print('[run.py] eval_tables: %s' % pipeline_config.eval_input_path) + else: + print('[run.py] online training is enabled.') if pipeline_config.fg_json_path: fg_util.load_fg_json_to_config(pipeline_config) @@ -265,9 +270,13 @@ def main(argv): if FLAGS.sampler_table: pipeline_config.data_config.negative_sampler.input_path = FLAGS.sampler_table - # parse selected_cols - set_selected_cols(pipeline_config, FLAGS.selected_cols, FLAGS.all_cols, - FLAGS.all_col_types) + if not FLAGS.online: + # parse selected_cols + set_selected_cols(pipeline_config, FLAGS.selected_cols, FLAGS.all_cols, + FLAGS.all_col_types) + else: + pipeline_config.data_config.selected_cols = '' + pipeline_config.data_config.selected_col_types = '' distribute_strategy = DistributionStrategyMap[FLAGS.distribute_strategy] @@ -331,9 +340,14 @@ def main(argv): set_distribution_config(pipeline_config, num_worker, num_gpus_per_worker, distribute_strategy) - # parse selected_cols - set_selected_cols(pipeline_config, FLAGS.selected_cols, FLAGS.all_cols, - FLAGS.all_col_types) + if not FLAGS.online: + # parse selected_cols + set_selected_cols(pipeline_config, FLAGS.selected_cols, FLAGS.all_cols, + FLAGS.all_col_types) + else: + pipeline_config.data_config.selected_cols = '' + pipeline_config.data_config.selected_col_types = '' + if FLAGS.distribute_eval: easy_rec.distribute_evaluate(pipeline_config, FLAGS.checkpoint_path, None, FLAGS.eval_result_path) diff --git a/processor/dataset.config b/processor/dataset.config new file mode 100644 index 000000000..279175c2a --- /dev/null +++ b/processor/dataset.config @@ -0,0 +1,67 @@ + batch_size: 1024 + label_fields: "clk" + input_type: RTPInput + 
separator: "" + selected_cols: "0,3" + input_fields { + input_name: "clk" + input_type: INT32 + default_val: "0" + } + input_fields { + input_name: "user_id" + } + input_fields { + input_name: "cms_segid" + } + input_fields { + input_name: "cms_group_id" + } + input_fields { + input_name: "age_level" + } + input_fields { + input_name: "pvalue_level" + } + input_fields { + input_name: "shopping_level" + } + input_fields { + input_name: "occupation" + } + input_fields { + input_name: "new_user_class_level" + } + input_fields { + input_name: "adgroup_id" + } + input_fields { + input_name: "cate_id" + } + input_fields { + input_name: "campaign_id" + } + input_fields { + input_name: "customer" + } + input_fields { + input_name: "brand" + } + input_fields { + input_name: "price" + input_type: DOUBLE + default_val: "0.0" + } + input_fields { + input_name: "pid" + } + input_fields { + input_name: "user_tag_cate" + } + input_fields { + input_name: "combo_brand" + } + input_fields { + input_name: "combo_cate_id" + } + rtp_separator: ";" diff --git a/processor/test.py b/processor/test.py new file mode 100644 index 000000000..bcbd242dd --- /dev/null +++ b/processor/test.py @@ -0,0 +1,123 @@ +# -*- encoding:utf-8 -*- +# Copyright (c) Alibaba, Inc. and its affiliates. +import argparse +import subprocess +import ctypes +import tf_predict_pb2 +import dataset_pb2 +from google.protobuf import text_format +import time +import glob +import json +import os +import logging + +logging.basicConfig( + level=logging.INFO, format='[%(asctime)s][%(levelname)s] %(message)s') + +PROCESSOR_VERSION = 'LaRec-0.9.5a-a91ba55-Linux' +PROCESSOR_FILE = PROCESSOR_VERSION + '.tar.gz' +PROCESSOR_URL = 'http://easyrec.oss-cn-beijing.aliyuncs.com/deploy/' + PROCESSOR_FILE +PROCESSOR_ENTRY_LIB = 'processor/' + PROCESSOR_VERSION + '/larec/libtf_predictor.so' + +def build_array_proto(array_proto, data, dtype): + array_proto.array_shape.dim.append(len(data)) + + if dtype == dataset_pb2.DatasetConfig.STRING: + array_proto.string_val.extend([ x.encode('utf-8') for x in data ]) + array_proto.dtype = tf_predict_pb2.DT_STRING + elif dtype == dataset_pb2.DatasetConfig.FLOAT: + array_proto.float_val.extend([ float(x) for x in data]) + array_proto.dtype = tf_predict_pb2.DT_FLOAT + elif dtype == dataset_pb2.DatasetConfig.DOUBLE: + array_proto.double_val.extend([ float(x) for x in data]) + array_proto.dtype = tf_predict_pb2.DT_DOUBLE + elif dtype == dataset_pb2.DatasetConfig.INT32: + array_proto.int_val.extend([ int(x) for x in data]) + array_proto.dtype = tf_predict_pb2.DT_INT32 + elif dtype == dataset_pb2.DatasetConfig.INT64: + array_proto.int64_val.extend([ np.int64(x) for x in data]) + array_proto.dtype = tf_predict_pb2.DT_INT64 + else: + assert False, 'invalid datatype[%s]' % str(dtype) + return array_proto + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_config', type=str, default=None, help='dataset_config') + parser.add_argument('--input_path', type=str, default=None, help='input data path') + parser.add_argument('--output_path', type=str, default=None, help='output data path') + parser.add_argument('--libc_path', type=str, default='/lib64/libc.so.6', help='libc.so.6 path') + parser.add_argument('--saved_model_dir', type=str, default=None, help='saved model directory') + args = parser.parse_args() + + if not os.path.exists(PROCESSOR_ENTRY_LIB): + if not os.path.exists('processor/' + PROCESSOR_FILE): + subprocess.check_output('wget %s -O processor/%s' % (PROCESSOR_URL, + PROCESSOR_FILE), 
shell=True) + subprocess.check_output('cd processor && tar -zvxf %s' % PROCESSOR_FILE, + shell=True) + assert os.path.exists(PROCESSOR_ENTRY_LIB), 'invalid processor path: %s'\ + % PROCESSOR_ENTRY_LIB + + assert os.path.exists(args.libc_path), '%s does not exist' % args.libc_path + assert args.saved_model_dir is not None and os.path.isdir(args.saved_model_dir),\ + '%s is not a valid directory' % args.saved_model_dir + assert args.data_config is not None and os.path.exists(args.data_config),\ + '%s does not exist' % args.data_config + assert args.input_path is not None and os.path.exists(args.input_path),\ + '%s does not exist' % args.input_path + assert args.output_path is not None, 'output_path is not set' + + data_config = dataset_pb2.DatasetConfig() + with open(args.data_config) as fin: + config_str = fin.read() + text_format.Merge(config_str, data_config) + + input_fields = [ [] for x in data_config.input_fields if x.input_name\ + not in data_config.label_fields ] + + with open(args.input_path, 'r') as fin: + for line_str in fin: + line_str = line_str.strip() + line_toks = line_str.split(data_config.rtp_separator)[-1].split(chr(2)) + for i, tok in enumerate(line_toks): + input_fields[i].append(tok) + + req = tf_predict_pb2.PredictRequest() + req.signature_name = 'serving_default' + for i in range(len(input_fields)): + build_array_proto(req.inputs[data_config.input_fields[i+1].input_name], + input_fields[i], data_config.input_fields[i+1].input_type) + + tf_predictor = ctypes.cdll.LoadLibrary(PROCESSOR_ENTRY_LIB) + tf_predictor.saved_model_init.restype=ctypes.c_void_p + handle=tf_predictor.saved_model_init(args.saved_model_dir.encode('utf-8')) + logging.info('saved_model handle=%d' % handle) + + sparse_step = ctypes.c_int(0) + dense_step = ctypes.c_int(0) + while sparse_step.value < 20 or dense_step.value < 20: + tf_predictor.saved_model_step(ctypes.c_void_p(handle), ctypes.byref(sparse_step), + ctypes.byref(dense_step)) + time.sleep(1) + + data_bin = req.SerializeToString() + tf_predictor.saved_model_predict.restype=ctypes.c_void_p + out_len = ctypes.c_int(0) + res_p = tf_predictor.saved_model_predict(ctypes.c_void_p(handle), data_bin, + ctypes.c_int32(len(data_bin)), ctypes.byref(out_len)) + res_bytes = bytearray(ctypes.string_at(res_p, out_len)) + res = tf_predict_pb2.PredictResponse() + res.ParseFromString(res_bytes) + + with open(args.output_path, 'w') as fout: + logits = res.outputs['logits'].float_val + probs = res.outputs['probs'].float_val + for logit, prob in zip(logits, probs): + fout.write(json.dumps({'logits': logit, 'probs': prob}) + '\n') + + # free memory + tf_predictor.saved_model_release(ctypes.c_void_p(handle)) + libc = ctypes.cdll.LoadLibrary(args.libc_path) + libc.free(ctypes.c_void_p(res_p)) diff --git a/samples/dh_script/configs/deepfm.config b/samples/dh_script/configs/deepfm.config index 9902eb2ef..2bf2dba39 100644 --- a/samples/dh_script/configs/deepfm.config +++ b/samples/dh_script/configs/deepfm.config @@ -14,7 +14,7 @@ train_config { } use_moving_average: false } - log_step_count_steps: 200L + log_step_count_steps: 200 sync_replicas: true } @@ -30,8 +30,6 @@ datahub_train_input{ region:"{DH_REG}" project:"{DH_PRO}" topic:"{DH_TOPIC}" - shard_num:3 - life_cycle:7 } datahub_eval_input{ @@ -40,8 +38,6 @@ datahub_eval_input{ region:"{DH_REG}" project:"{DH_PRO}" topic:"{DH_TOPIC}" - shard_num:3 - life_cycle:7 } data_config { diff --git a/samples/dh_script/configs/dh.config b/samples/dh_script/configs/dh.config deleted file mode 100644 index 1aa936bab..000000000 
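The kafka_train_input/kafka_eval_input blocks in the config that follows carry an `offset_info` JSON. Assuming, as the sample values suggest, that it maps partition id to a starting offset, the equivalent positioning expressed with plain kafka-python would look like the sketch below; this is only to illustrate the field format, not how EasyRec itself applies it.

```python
# Sketch: offset_info such as '{"0": 5, "1": 10}' is read as partition id ->
# starting offset and applied by assigning the partitions and seeking each one.
import json

from kafka import KafkaConsumer
from kafka.structs import TopicPartition


def seek_to_offsets(consumer, topic, offset_info):
  offsets = json.loads(offset_info)
  parts = [TopicPartition(topic, int(p)) for p in offsets]
  consumer.assign(parts)
  for tp in parts:
    consumer.seek(tp, offsets[str(tp.partition)])


consumer = KafkaConsumer(bootstrap_servers=['127.0.0.1:9092'], group_id='kafka_train')
seek_to_offsets(consumer, 'kafka_op_test_topic', '{"0": 5, "1": 10}')
```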
--- a/samples/dh_script/configs/dh.config +++ /dev/null @@ -1,6 +0,0 @@ -[datahub] -access_id= -access_key= -endpoint=https://dh-cn-beijing.aliyuncs.com -topic_name=pf_test -project=tmf_easy diff --git a/samples/model_config/deepfm_combo_avazu_kafka.config b/samples/model_config/deepfm_combo_avazu_kafka.config new file mode 100644 index 000000000..43746f495 --- /dev/null +++ b/samples/model_config/deepfm_combo_avazu_kafka.config @@ -0,0 +1,386 @@ +# data/test/dwd_avazu_ctr_deepmodel_10w.csv + +kafka_train_input { + server: '127.0.0.1:9092' + topic: 'kafka_op_test_topic' + group: 'kafka_train' + offset_info: '{"0": 5, "1": 10}' +} + +kafka_eval_input { + server: '127.0.0.1:9092' + topic: 'kafka_op_test_topic' + group: 'kafka_test' + offset_info: '{"0":20, "1":30}' +} + +model_dir: "experiments/dwd_avazu_out_test_combo_kafka" + +train_config { + log_step_count_steps: 200 + # fine_tune_checkpoint: "" + optimizer_config: { + adam_optimizer: { + learning_rate: { + exponential_decay_learning_rate { + initial_learning_rate: 0.0001 + decay_steps: 10000 + decay_factor: 0.5 + min_learning_rate: 0.0000001 + } + } + } + use_moving_average: false + } + + sync_replicas: true + save_checkpoints_steps: 500 + num_steps: 1000 +} + +eval_config { + metrics_set: { + auc {} + } +} + +data_config { + separator: "," + input_fields: { + input_name: "label" + input_type: INT64 + default_val:"0" + } + input_fields: { + input_name: "hour" + input_type: INT64 + default_val:"0" + } + input_fields: { + input_name: "c1" + input_type: INT64 + default_val:"0" + } + input_fields: { + input_name: "banner_pos" + input_type: INT64 + default_val:"0" + } + input_fields: { + input_name: "site_id" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "site_domain" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "site_category" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "app_id" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "app_domain" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "app_category" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "device_id" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "device_ip" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "device_model" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "device_type" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "device_conn_type" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "c14" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "c15" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "c16" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "c17" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "c18" + input_type: STRING + default_val:"0" + } + input_fields: { + input_name: "c19" + input_type: INT64 + default_val:"0" + } + input_fields: { + input_name: "c20" + input_type: INT64 + default_val:"0" + } + input_fields: { + input_name: "c21" + input_type: INT64 + default_val:"0" + } + label_fields: "label" + + batch_size: 1024 + prefetch_size: 32 + input_type: KafkaInput +} + +feature_config: { + features: { + input_names: "hour" + feature_type: IdFeature + num_buckets: 24 + embedding_dim: 16 + } + features: { + input_names: "c1" + feature_type: RawFeature + 
boundaries: [1000.0,1001.0,1002.0,1003.0,1004.0,1005.0,1006.0,1007.0,1008.0,1009.0,1010.0,1011.0,1012.0,1013.0,1014.0,1015.0] + embedding_dim: 16 + } + features: { + input_names: "banner_pos" + feature_type: RawFeature + boundaries: [1,2,3,4,5,6] + embedding_dim: 16 + } + features: { + input_names: "site_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 + } + features: { + input_names: "site_domain" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: "site_category" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: "app_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 + } + features: { + input_names: "app_domain" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 1000 + } + features: { + input_names: "app_category" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: "device_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: "device_ip" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: "device_model" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 + } + features: { + input_names: "device_type" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: "device_conn_type" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: "c14" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 500 + } + features: { + input_names: "c15" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 500 + } + features: { + input_names: "c16" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 500 + } + features: { + input_names: "c17" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 500 + } + features: { + input_names: "c18" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 500 + } + features: { + input_names: "c19" + feature_type: RawFeature + boundaries: [10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190] + embedding_dim: 16 + } + features: { + input_names: "c20" + feature_type: RawFeature + boundaries: [100.0,200.0,300.0,400.0,500.0,600.0,700.0,800.0, 900.0, 1000.0,1100.0,1200.0, 1300.0,1400.0] + embedding_dim: 16 + } + features: { + input_names: "c21" + feature_type: RawFeature + boundaries: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25] + embedding_dim: 16 + } + features: { + input_names: ["site_id", "app_id"] + feature_name: "site_id_app_id" + feature_type: ComboFeature + hash_bucket_size: 1000, + embedding_dim: 16 + } + +} +model_config:{ + model_class: "DeepFM" + feature_groups: { + group_name: "deep" + feature_names: "hour" + feature_names: "c1" + feature_names: "banner_pos" + feature_names: "site_id" + feature_names: "site_domain" + feature_names: "site_category" + feature_names: "app_id" + feature_names: "app_domain" + feature_names: "app_category" + feature_names: "device_id" + feature_names: "device_ip" + feature_names: "device_model" + feature_names: "device_type" + feature_names: "device_conn_type" + feature_names: "c14" + feature_names: "c15" + feature_names: "c16" + feature_names: "c17" + feature_names: "c18" + feature_names: "c19" + feature_names: "c20" + feature_names: "c21" + feature_names: "site_id_app_id" + wide_deep:DEEP + } + 
feature_groups: { + group_name: "wide" + feature_names: "hour" + feature_names: "c1" + feature_names: "banner_pos" + feature_names: "site_id" + feature_names: "site_domain" + feature_names: "site_category" + feature_names: "app_id" + feature_names: "app_domain" + feature_names: "app_category" + feature_names: "device_id" + feature_names: "device_ip" + feature_names: "device_model" + feature_names: "device_type" + feature_names: "device_conn_type" + feature_names: "c14" + feature_names: "c15" + feature_names: "c16" + feature_names: "c17" + feature_names: "c18" + feature_names: "c19" + feature_names: "c20" + feature_names: "c21" + wide_deep:WIDE + } + + deepfm { + wide_output_dim: 16 + + dnn { + hidden_units: [128, 64, 32] + } + + final_dnn { + hidden_units: [128, 64] + } + l2_regularization: 1e-5 + } + embedding_regularization: 1e-7 +} + +export_config { + multi_placeholder: false +} diff --git a/samples/model_config/deepfm_combo_on_avazu_ctr.config b/samples/model_config/deepfm_combo_on_avazu_ctr.config index 4d637c62c..25882a317 100644 --- a/samples/model_config/deepfm_combo_on_avazu_ctr.config +++ b/samples/model_config/deepfm_combo_on_avazu_ctr.config @@ -364,7 +364,7 @@ model_config:{ } l2_regularization: 1e-5 } - embedding_regularization: 1e-7 + # embedding_regularization: 1e-7 } export_config { diff --git a/samples/model_config/taobao_fg_incr_save.config b/samples/model_config/taobao_fg_incr_save.config new file mode 100644 index 000000000..cfdb010a8 --- /dev/null +++ b/samples/model_config/taobao_fg_incr_save.config @@ -0,0 +1,312 @@ +train_input_path: "data/test/rtp/taobao_train_feature.txt" +eval_input_path: "data/test/rtp/taobao_test_feature.txt" +model_dir: "experiments/taobao_fg_incr_save" + +train_config { + optimizer_config { + use_moving_average: false + momentum_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.0001 + decay_steps: 100000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + } + num_steps: 20 + sync_replicas: true + log_step_count_steps: 200 + save_checkpoints_steps: 50 + + incr_save_config { + dense_save_steps: 10 + sparse_save_steps: 10 + kafka { + server: '127.0.0.1:9092' + topic: 'kafka_model_20220408' + consumer { + offset:0 + } + } + } + + # enable_oss_stop_signal: true +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 1024 + label_fields: "clk" + input_type: RTPInput + separator: "" + selected_cols: "0,3" + input_fields { + input_name: "clk" + input_type: INT32 + default_val: "0" + } + input_fields { + input_name: "user_id" + } + input_fields { + input_name: "cms_segid" + } + input_fields { + input_name: "cms_group_id" + } + input_fields { + input_name: "age_level" + } + input_fields { + input_name: "pvalue_level" + } + input_fields { + input_name: "shopping_level" + } + input_fields { + input_name: "occupation" + } + input_fields { + input_name: "new_user_class_level" + } + input_fields { + input_name: "adgroup_id" + } + input_fields { + input_name: "cate_id" + } + input_fields { + input_name: "campaign_id" + } + input_fields { + input_name: "customer" + } + input_fields { + input_name: "brand" + } + input_fields { + input_name: "price" + input_type: DOUBLE + default_val: "0.0" + } + input_fields { + input_name: "pid" + } + input_fields { + input_name: "user_tag_cate" + } + input_fields { + input_name: "combo_brand" + } + input_fields { + input_name: "combo_cate_id" + } + rtp_separator: ";" +} +feature_config: { + features { + input_names: "user_id" + feature_type: 
TagFeature + embedding_dim: 16 + hash_bucket_size: 100000 + max_partitions: 4 + separator: "" + } + features { + input_names: "cms_segid" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100 + separator: "" + } + features { + input_names: "cms_group_id" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100 + separator: "" + } + features { + input_names: "age_level" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 10 + separator: "" + } + features { + input_names: "pvalue_level" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 10 + separator: "" + } + features { + input_names: "shopping_level" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 10 + separator: "" + } + features { + input_names: "occupation" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 10 + separator: "" + } + features { + input_names: "new_user_class_level" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 10 + separator: "" + } + features { + input_names: "adgroup_id" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "" + } + features { + input_names: "cate_id" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "" + } + features { + input_names: "campaign_id" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "" + } + features { + input_names: "customer" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "" + } + features { + input_names: "brand" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "" + } + features { + input_names: "price" + feature_type: RawFeature + separator: "" + } + features { + input_names: "pid" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "" + } + features { + input_names: "user_tag_cate" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "" + } + features { + input_names: "combo_brand" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "" + } + features { + input_names: "combo_cate_id" + feature_type: TagFeature + embedding_dim: 16 + hash_bucket_size: 10000 + separator: "" + } +} +model_config { + model_class: "MultiTower" + feature_groups { + group_name: "item" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + } + feature_groups { + group_name: "user" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "user_tag_cate" + wide_deep: DEEP + } + feature_groups { + group_name: "combo" + feature_names: "combo_brand" + feature_names: "combo_cate_id" + wide_deep: DEEP + } + embedding_regularization: 1e-05 + multi_tower { + towers { + input: "item" + dnn { + hidden_units: 192 + hidden_units: 256 + hidden_units: 192 + hidden_units: 128 + } + } + towers { + input: "user" + dnn { + hidden_units: 192 + hidden_units: 256 + hidden_units: 192 + hidden_units: 128 + } + } + towers { + input: "combo" + dnn { + hidden_units: 192 + hidden_units: 256 + hidden_units: 192 + hidden_units: 128 + } + } + final_dnn { + hidden_units: 256 + 
hidden_units: 192 + hidden_units: 128 + hidden_units: 64 + } + l2_regularization: 0.0001 + } +} +export_config { + multi_placeholder: true +} diff --git a/samples/odps_script/configs/dwd_avazu_ctr_deepmodel_ext.config b/samples/odps_script/configs/dwd_avazu_ctr_deepmodel_ext.config index cda3ba3a9..80297c016 100644 --- a/samples/odps_script/configs/dwd_avazu_ctr_deepmodel_ext.config +++ b/samples/odps_script/configs/dwd_avazu_ctr_deepmodel_ext.config @@ -24,8 +24,8 @@ train_config { } sync_replicas: true - #train_distribute: MirroredStrategy - #num_gpus_per_worker: 2 + train_distribute: MirroredStrategy + num_gpus_per_worker: 2 #is_profiling: true } diff --git a/scripts/build.sh b/scripts/build.sh index abd0f30b5..6a6c1197a 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -1,3 +1,5 @@ #!/bin/sh -cd ../ && sh -x scripts/gen_proto.sh && python3.7 setup.py sdist bdist_wheel && cp package/dist/easy*.whl . && cd - +sh -x scripts/gen_proto.sh +python setup.py sdist bdist_wheel +ls -lh dist/easy*.whl diff --git a/scripts/gen_proto.sh b/scripts/gen_proto.sh index 263df44da..7b6d16a7f 100644 --- a/scripts/gen_proto.sh +++ b/scripts/gen_proto.sh @@ -16,5 +16,4 @@ then exit 1 fi -#PATH=protoc/bin protoc/bin/protoc --doc_out=html,index.html:. easy_rec/python/protos/*.proto -#sed -i 's#

##g;s###g' index.html +cp easy_rec/python/protos/dataset_pb2.py easy_rec/python/protos/tf_predict_pb2.py processor/ diff --git a/scripts/kafka_test.sh b/scripts/kafka_test.sh new file mode 100644 index 000000000..e18193629 --- /dev/null +++ b/scripts/kafka_test.sh @@ -0,0 +1 @@ +kafka_install_dir=../kafka_2.13-3.1.0/ oss_path=oss://yangxi-bj/export_embedding_taobao_fg_step_0 oss_ak=xxx oss_sk=xxx oss_endpoint=oss-cn-beijing.aliyuncs.com TEST_DEVICES='' PYTHONPATH=.:pai_jobs/ python -m easy_rec.python.test.kafka_test KafkaTest.test_kafka_processor
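Note on the kafka inputs added above: offset_info in kafka_train_input / kafka_eval_input is a JSON map from partition id to the starting offset. A minimal sketch of how such offsets could be restored with the kafka-python client (the client choice and variable names are assumptions for illustration; the actual KafkaInput implementation may differ):

import json

from kafka import KafkaConsumer, TopicPartition

# partition id -> starting offset, as in kafka_train_input.offset_info
offset_info = json.loads('{"0": 5, "1": 10}')
consumer = KafkaConsumer(
    bootstrap_servers='127.0.0.1:9092',
    group_id='kafka_train',
    enable_auto_commit=False)
partitions = [TopicPartition('kafka_op_test_topic', int(p)) for p in offset_info]
consumer.assign(partitions)
for tp in partitions:
    # resume each assigned partition from its recorded offset
    consumer.seek(tp, offset_info[str(tp.partition)])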