API: palantir_models.serializers reference(API:palantir_models.serializers 参考文档)¶
The palantir_models.serializers library provides many default serialization methods for saving and loading models that are trained inside Foundry; most models should be able to use one of the default model serializers.
How to write a model serializer¶
In some cases it can be useful to create a reusable auto_serializer, for example, if your organization has a model format that is reused and often integrated into Foundry as models then creating a reusable auto_serializer can standardize and reduce duplicated code across different models and teams.
To create an auto_serializer, you should extend the palantir_models.models._serialization.ModelSerializer base class and implement the __init__, serialize, and deserialize methods.
- The
__init__method is the constructor that a consumer of your serializer will interact with. - The
serializemethod will be provided with the object that you should serialize to thepalantir_models.models._state_accessors.ModelStateWriter. - The
deserializemethod will be called at model load time and allows you to reinstantiate and return the serialized model from the providedpalantir_models.models._state_accessors.ModelStateReader.
Your auto_serializer should be published as a shared Python library.
Implementation of provided serializers¶
For reference, we provide implementations of existing default serializers in palantir_models.serializers.
palantir_models.serializers.CloudPickleSerializer¶
import importlib
from types import ModuleType
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class CloudPickleSerializer(ModelSerializer[object]):
"""Serializer utilizing the cloudpickle library for generic objects"""
file_name = "cloudpickle.pkl"
cloudpickle: ModuleType
def __init__(self):
self.cloudpickle = importlib.import_module("cloudpickle")
def serialize(self, writer: ModelStateWriter, obj: object):
with writer.open(self.file_name, "wb") as cloudpickle_file:
self.cloudpickle.dump(obj, cloudpickle_file)
def deserialize(self, reader: ModelStateReader) -> object:
with reader.open(self.file_name, "rb") as cloudpickle_file:
obj = self.cloudpickle.load(cloudpickle_file)
return obj
palantir_models.serializers.DillSerializer¶
import importlib
from types import ModuleType
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class DillSerializer(ModelSerializer[object]):
"""Serializer utilizing the dill library for generic objects"""
file_name = "dill.pkl"
dill: ModuleType
def __init__(self):
self.dill = importlib.import_module("dill")
def serialize(self, writer: ModelStateWriter, obj: object):
with writer.open(self.file_name, "wb") as dill_file:
self.dill.dump(obj, dill_file, recurse=True)
def deserialize(self, reader: ModelStateReader) -> object:
with reader.open(self.file_name, "rb") as dill_file:
obj = self.dill.load(dill_file)
return obj
palantir_models.serializers.HfAutoModelSerializer¶
class HfAutoModelSerializer(ModelSerializer):
"""
Serializer for huggingface transformers AutoModel classes, using the
from_pretrained and save_pretrained methods.
Allows configuring a specific subclass (e.g. AutoModelForSequenceClassification or
BertForTokenClassification) and passing additional kwargs to from_pretrained
(e.g. num_labels=2).
"""
DIR_NAME = "model"
def __init__(self, model_class=None, **load_kwargs):
if model_class is None:
transformers = importlib.import_module("transformers")
model_class = transformers.AutoModel
self.model_class = model_class
self.load_kwargs = load_kwargs
def serialize(self, writer: ModelStateWriter, obj):
model_dir = writer.mkdir(self.DIR_NAME)
obj.save_pretrained(model_dir)
def deserialize(self, reader: ModelStateReader):
model_dir = reader.dir(self.DIR_NAME)
return self.model_class.from_pretrained(model_dir, **self.load_kwargs)
palantir_models.serializers.HfAutoTokenizerSerializer¶
class HfAutoTokenizerSerializer(ModelSerializer):
"""
Serializer for huggingface transformers AutoTokenizer.
"""
DIR_NAME = "tokenizer"
def __init__(self, tokenizer_class=None, **load_kwargs):
if tokenizer_class is None:
transformers = importlib.import_module("transformers")
tokenizer_class = transformers.AutoTokenizer
self.tokenizer_class = tokenizer_class
self.load_kwargs = load_kwargs
def serialize(self, writer: ModelStateWriter, obj):
tokenizer_dir = writer.mkdir(self.DIR_NAME)
obj.save_pretrained(tokenizer_dir)
def deserialize(self, reader: ModelStateReader):
tokenizer_dir = reader.dir(self.DIR_NAME)
return self.tokenizer_class.from_pretrained(tokenizer_dir, **self.load_kwargs)
palantir_models.serializers.HfPipelineSerializer¶
import importlib
from palantir_models import ModelSerializer, ModelStateReader, ModelStateWriter
class HfPipelineSerializer(ModelSerializer):
"""
Serializer for huggingface transformers pipelines.
Allows setting the pipeline task (e.g. sentiment-analysis).
"""
DIR_NAME = "pipeline"
def __init__(self, pipeline_type, **load_kwargs):
self.transformers = importlib.import_module("transformers")
self.pipeline_type = pipeline_type
self.load_kwargs = load_kwargs
def serialize(self, writer: ModelStateWriter, obj):
pipeline_dir = writer.mkdir(self.DIR_NAME)
obj.save_pretrained(pipeline_dir)
def deserialize(self, reader: ModelStateReader):
pipeline_dir = reader.dir(self.DIR_NAME)
return self.transformers.pipeline(self.pipeline_type, model=pipeline_dir, **self.load_kwargs)
palantir_models.serializers.JsonSerializer¶
import importlib
from types import ModuleType
from typing import Dict
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class JsonSerializer(ModelSerializer[Dict]):
"""Serializer for json-convertible objects and dictionaries"""
file_name = "config.json"
json: ModuleType
def __init__(self):
self.json = importlib.import_module("json")
def serialize(self, writer: ModelStateWriter, obj: Dict):
with writer.open(self.file_name, "w") as conf:
self.json.dump(obj, conf)
def deserialize(self, reader: ModelStateReader) -> Dict:
with reader.open(self.file_name, "r") as conf:
return self.json.load(conf)
palantir_models.serializers.PytorchStateSerializer¶
import importlib
from types import ModuleType
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class PytorchStateSerializer(ModelSerializer):
"""Serializer for PyTorch state dictionaries."""
STATE_DICT_FILE_NAME = "model_state_dict.pt"
torch: ModuleType
def __init__(self):
self.torch = importlib.import_module("torch")
def serialize(self, writer: ModelStateWriter, obj: dict):
"""Serializes the state_dict of a PyTorch model."""
with writer.open(self.STATE_DICT_FILE_NAME, "wb") as file_path:
self.torch.save(obj, file_path)
def deserialize(self, reader: ModelStateReader) -> dict:
"""Deserializes the state_dict of a PyTorch model."""
with reader.open(self.STATE_DICT_FILE_NAME, "rb") as file_path:
state_dict = self.torch.load(file_path)
return state_dict
palantir_models.serializers.TensorflowKerasSerializer¶
import enum
import importlib
import os
from types import ModuleType
from typing import Any, Dict, Optional
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateReader, ModelStateWriter
class TensorflowFormat(enum.Enum):
DIR = 0
H5 = 1
KERAS = 2
def get_save_path(self, dir_path):
if self == TensorflowFormat.DIR:
return dir_path
if self == TensorflowFormat.H5:
return os.path.join(dir_path, "model.h5")
if self == TensorflowFormat.KERAS:
return os.path.join(dir_path, "model.keras")
class TensorflowKerasSerializer(ModelSerializer):
"""Serializer for tensorflow keras models"""
DIR_NAME: str = "tensorflow_saved_model_dir"
__tensorflow: ModuleType
def __init__(self, format=TensorflowFormat.DIR, custom_objects: Optional[Dict[str, Any]] = None):
self.__tensorflow = importlib.import_module("tensorflow")
self.__custom_objects = custom_objects
self.__format = format
def serialize(self, writer: ModelStateWriter, obj: "tensorflow.keras.Model"):
dir_path = writer.mkdir(self.DIR_NAME)
self._save_model(dir_path, obj)
def deserialize(self, reader: ModelStateReader) -> "tensorflow.keras.Model":
dir_path = reader.dir(self.DIR_NAME)
obj = self._load_model(dir_path)
obj.compile()
return obj
def _save_model(self, dir_path: str, obj: "tensorflow.keras.Model"):
try:
save_path = self.__format.get_save_path(dir_path)
obj.save(save_path)
except ValueError as exc:
if "Invalid filepath extension for saving" in str(exc):
raise ValueError(
"Serialization failed due to invalid save format. Specify the correct file format in the TensorflowKerasSerializer constructor:\n"
"\tTensorflowKerasSerializer(format=TensorflowFormat.DIR) to use directory saving. This save format is considered legacy in Tensorflow\n"
"\tTensorflowKerasSerializer(format=TensorflowFormat.H5) to use .h5 file format saving\n"
"\tTensorflowKerasSerializer(format=TensorflowFormat.KERAS) to use .keras file format saving\n"
) from exc
raise exc
def _load_model(self, dir_path: str) -> "tensorflow.keras.Model":
save_path = self.__format.get_save_path(dir_path)
return self.__tensorflow.keras.models.load_model(save_path, custom_objects=self.__custom_objects, compile=False)
palantir_models.serializers.XGBoostSerializer¶
from palantir_models import ModelSerializer
from palantir_models.models._serialization import ModelStateReader, ModelStateWriter
from xgboost.sklearn import XGBModel
class XGBoostSerializer(ModelSerializer[XGBModel]):
"""Simple Serializer for XGBoost SkLearn Models."""
file_name = "xgboost_model.json"
def serialize(self, writer: ModelStateWriter, obj: XGBModel):
with writer.open(self.file_name, "w") as xgbfile:
obj.save_model(xgbfile.name)
def deserialize(self, reader: ModelStateReader) -> XGBModel:
model = XGBModel()
with reader.open(self.file_name, "r") as xgbfile:
model.load_model(xgbfile.name)
return model
palantir_models.serializers.YamlSerializer¶
import importlib
from types import ModuleType
from typing import Dict
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class YamlSerializer(ModelSerializer[Dict]):
"""Serializer for yaml-convertible objects and dictionaries"""
file_name = "config.yaml"
yaml: ModuleType
def __init__(self):
self.yaml = importlib.import_module("yaml")
def serialize(self, writer: ModelStateWriter, obj: Dict):
with writer.open(self.file_name, "w") as conf:
self.yaml.safe_dump(obj, conf)
def deserialize(self, reader: ModelStateReader) -> Dict:
with reader.open(self.file_name, "r") as conf:
return self.yaml.safe_load(conf)
中文翻译¶
API:palantir_models.serializers 参考文档¶
palantir_models.serializers 库提供了许多默认序列化方法,用于保存和加载在 Foundry 内部训练的模型;大多数模型应能够使用其中一个默认模型序列化器。
如何编写模型序列化器¶
在某些情况下,创建可复用的 auto_serializer 会非常有用。例如,如果您的组织拥有一种可复用的模型格式,并且经常将其作为模型集成到 Foundry 中,那么创建一个可复用的 auto_serializer 可以标准化并减少不同模型和团队之间的重复代码。
要创建 auto_serializer,您需要继承 palantir_models.models._serialization.ModelSerializer 基类,并实现 __init__、serialize 和 deserialize 方法。
__init__方法是构造函数,序列化器的使用者将与之交互。serialize方法将接收一个对象,您需要将其序列化到palantir_models.models._state_accessors.ModelStateWriter。deserialize方法将在模型加载时被调用,它允许您从提供的palantir_models.models._state_accessors.ModelStateReader中重新实例化并返回序列化的模型。
您的 auto_serializer 应作为共享 Python 库发布。
提供的序列化器实现¶
作为参考,我们在 palantir_models.serializers 中提供了现有默认序列化器的实现。
palantir_models.serializers.CloudPickleSerializer¶
import importlib
from types import ModuleType
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class CloudPickleSerializer(ModelSerializer[object]):
"""利用 cloudpickle 库对通用对象进行序列化的序列化器"""
file_name = "cloudpickle.pkl"
cloudpickle: ModuleType
def __init__(self):
self.cloudpickle = importlib.import_module("cloudpickle")
def serialize(self, writer: ModelStateWriter, obj: object):
with writer.open(self.file_name, "wb") as cloudpickle_file:
self.cloudpickle.dump(obj, cloudpickle_file)
def deserialize(self, reader: ModelStateReader) -> object:
with reader.open(self.file_name, "rb") as cloudpickle_file:
obj = self.cloudpickle.load(cloudpickle_file)
return obj
palantir_models.serializers.DillSerializer¶
import importlib
from types import ModuleType
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class DillSerializer(ModelSerializer[object]):
"""利用 dill 库对通用对象进行序列化的序列化器"""
file_name = "dill.pkl"
dill: ModuleType
def __init__(self):
self.dill = importlib.import_module("dill")
def serialize(self, writer: ModelStateWriter, obj: object):
with writer.open(self.file_name, "wb") as dill_file:
self.dill.dump(obj, dill_file, recurse=True)
def deserialize(self, reader: ModelStateReader) -> object:
with reader.open(self.file_name, "rb") as dill_file:
obj = self.dill.load(dill_file)
return obj
palantir_models.serializers.HfAutoModelSerializer¶
class HfAutoModelSerializer(ModelSerializer):
"""
用于 huggingface transformers AutoModel 类的序列化器,
使用 from_pretrained 和 save_pretrained 方法。
允许配置特定的子类(例如 AutoModelForSequenceClassification 或
BertForTokenClassification)并向 from_pretrained 传递额外的 kwargs
(例如 num_labels=2)。
"""
DIR_NAME = "model"
def __init__(self, model_class=None, **load_kwargs):
if model_class is None:
transformers = importlib.import_module("transformers")
model_class = transformers.AutoModel
self.model_class = model_class
self.load_kwargs = load_kwargs
def serialize(self, writer: ModelStateWriter, obj):
model_dir = writer.mkdir(self.DIR_NAME)
obj.save_pretrained(model_dir)
def deserialize(self, reader: ModelStateReader):
model_dir = reader.dir(self.DIR_NAME)
return self.model_class.from_pretrained(model_dir, **self.load_kwargs)
palantir_models.serializers.HfAutoTokenizerSerializer¶
class HfAutoTokenizerSerializer(ModelSerializer):
"""
用于 huggingface transformers AutoTokenizer 的序列化器。
"""
DIR_NAME = "tokenizer"
def __init__(self, tokenizer_class=None, **load_kwargs):
if tokenizer_class is None:
transformers = importlib.import_module("transformers")
tokenizer_class = transformers.AutoTokenizer
self.tokenizer_class = tokenizer_class
self.load_kwargs = load_kwargs
def serialize(self, writer: ModelStateWriter, obj):
tokenizer_dir = writer.mkdir(self.DIR_NAME)
obj.save_pretrained(tokenizer_dir)
def deserialize(self, reader: ModelStateReader):
tokenizer_dir = reader.dir(self.DIR_NAME)
return self.tokenizer_class.from_pretrained(tokenizer_dir, **self.load_kwargs)
palantir_models.serializers.HfPipelineSerializer¶
import importlib
from palantir_models import ModelSerializer, ModelStateReader, ModelStateWriter
class HfPipelineSerializer(ModelSerializer):
"""
用于 huggingface transformers pipelines 的序列化器。
允许设置 pipeline 任务(例如 sentiment-analysis)。
"""
DIR_NAME = "pipeline"
def __init__(self, pipeline_type, **load_kwargs):
self.transformers = importlib.import_module("transformers")
self.pipeline_type = pipeline_type
self.load_kwargs = load_kwargs
def serialize(self, writer: ModelStateWriter, obj):
pipeline_dir = writer.mkdir(self.DIR_NAME)
obj.save_pretrained(pipeline_dir)
def deserialize(self, reader: ModelStateReader):
pipeline_dir = reader.dir(self.DIR_NAME)
return self.transformers.pipeline(self.pipeline_type, model=pipeline_dir, **self.load_kwargs)
palantir_models.serializers.JsonSerializer¶
import importlib
from types import ModuleType
from typing import Dict
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class JsonSerializer(ModelSerializer[Dict]):
"""用于可转换为 JSON 的对象和字典的序列化器"""
file_name = "config.json"
json: ModuleType
def __init__(self):
self.json = importlib.import_module("json")
def serialize(self, writer: ModelStateWriter, obj: Dict):
with writer.open(self.file_name, "w") as conf:
self.json.dump(obj, conf)
def deserialize(self, reader: ModelStateReader) -> Dict:
with reader.open(self.file_name, "r") as conf:
return self.json.load(conf)
palantir_models.serializers.PytorchStateSerializer¶
import importlib
from types import ModuleType
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class PytorchStateSerializer(ModelSerializer):
"""用于 PyTorch 状态字典的序列化器。"""
STATE_DICT_FILE_NAME = "model_state_dict.pt"
torch: ModuleType
def __init__(self):
self.torch = importlib.import_module("torch")
def serialize(self, writer: ModelStateWriter, obj: dict):
"""序列化 PyTorch 模型的 state_dict。"""
with writer.open(self.STATE_DICT_FILE_NAME, "wb") as file_path:
self.torch.save(obj, file_path)
def deserialize(self, reader: ModelStateReader) -> dict:
"""反序列化 PyTorch 模型的 state_dict。"""
with reader.open(self.STATE_DICT_FILE_NAME, "rb") as file_path:
state_dict = self.torch.load(file_path)
return state_dict
palantir_models.serializers.TensorflowKerasSerializer¶
import enum
import importlib
import os
from types import ModuleType
from typing import Any, Dict, Optional
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateReader, ModelStateWriter
class TensorflowFormat(enum.Enum):
DIR = 0
H5 = 1
KERAS = 2
def get_save_path(self, dir_path):
if self == TensorflowFormat.DIR:
return dir_path
if self == TensorflowFormat.H5:
return os.path.join(dir_path, "model.h5")
if self == TensorflowFormat.KERAS:
return os.path.join(dir_path, "model.keras")
class TensorflowKerasSerializer(ModelSerializer):
"""用于 tensorflow keras 模型的序列化器"""
DIR_NAME: str = "tensorflow_saved_model_dir"
__tensorflow: ModuleType
def __init__(self, format=TensorflowFormat.DIR, custom_objects: Optional[Dict[str, Any]] = None):
self.__tensorflow = importlib.import_module("tensorflow")
self.__custom_objects = custom_objects
self.__format = format
def serialize(self, writer: ModelStateWriter, obj: "tensorflow.keras.Model"):
dir_path = writer.mkdir(self.DIR_NAME)
self._save_model(dir_path, obj)
def deserialize(self, reader: ModelStateReader) -> "tensorflow.keras.Model":
dir_path = reader.dir(self.DIR_NAME)
obj = self._load_model(dir_path)
obj.compile()
return obj
def _save_model(self, dir_path: str, obj: "tensorflow.keras.Model"):
try:
save_path = self.__format.get_save_path(dir_path)
obj.save(save_path)
except ValueError as exc:
if "Invalid filepath extension for saving" in str(exc):
raise ValueError(
"序列化失败,因为保存格式无效。请在 TensorflowKerasSerializer 构造函数中指定正确的文件格式:\n"
"\tTensorflowKerasSerializer(format=TensorflowFormat.DIR) 使用目录保存。此保存格式在 Tensorflow 中被视为旧版格式\n"
"\tTensorflowKerasSerializer(format=TensorflowFormat.H5) 使用 .h5 文件格式保存\n"
"\tTensorflowKerasSerializer(format=TensorflowFormat.KERAS) 使用 .keras 文件格式保存\n"
) from exc
raise exc
def _load_model(self, dir_path: str) -> "tensorflow.keras.Model":
save_path = self.__format.get_save_path(dir_path)
return self.__tensorflow.keras.models.load_model(save_path, custom_objects=self.__custom_objects, compile=False)
palantir_models.serializers.XGBoostSerializer¶
from palantir_models import ModelSerializer
from palantir_models.models._serialization import ModelStateReader, ModelStateWriter
from xgboost.sklearn import XGBModel
class XGBoostSerializer(ModelSerializer[XGBModel]):
"""用于 XGBoost SkLearn 模型的简单序列化器。"""
file_name = "xgboost_model.json"
def serialize(self, writer: ModelStateWriter, obj: XGBModel):
with writer.open(self.file_name, "w") as xgbfile:
obj.save_model(xgbfile.name)
def deserialize(self, reader: ModelStateReader) -> XGBModel:
model = XGBModel()
with reader.open(self.file_name, "r") as xgbfile:
model.load_model(xgbfile.name)
return model
palantir_models.serializers.YamlSerializer¶
import importlib
from types import ModuleType
from typing import Dict
from palantir_models.models._serialization import ModelSerializer
from palantir_models.models._state_accessors import ModelStateWriter, ModelStateReader
class YamlSerializer(ModelSerializer[Dict]):
"""用于可转换为 YAML 的对象和字典的序列化器"""
file_name = "config.yaml"
yaml: ModuleType
def __init__(self):
self.yaml = importlib.import_module("yaml")
def serialize(self, writer: ModelStateWriter, obj: Dict):
with writer.open(self.file_name, "w") as conf:
self.yaml.safe_dump(obj, conf)
def deserialize(self, reader: ModelStateReader) -> Dict:
with reader.open(self.file_name, "r") as conf:
return self.yaml.safe_load(conf)