华为云用户手册

  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs}drop_duplicates____id___ = MLSDropDuplicates(**params)drop_duplicates____id___.run()# @output {"label":"dataframe","name":"drop_duplicates____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "conditions_str": "" # @param {"label":"conditions_str","type":"string","required":"true","helpTip":""}}field_replace____id___ = MLSFieldReplace(**params)field_replace____id___.run()# @output {"label":"dataframe","name":"field_replace____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "sql_string": "" # @param {"label":"sql_string","type":"string","required":"true","helpTip":""}}execute_sql____id___ = MLSExecuteSql(**params)execute_sql____id___.run()# @output {"label":"dataframe","name":"execute_sql____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "b_output_action": True, "outer_pipeline_stages": None, "input_features_str": "", # @param {"label":"input_features_str","type":"string","required":"true","helpTip":""} "missing_value": "", # @param {"label":"missing_value","type":"number","required":"false","range":"(none,none)","helpTip":""} "strategy": "mean", # @param {"label":"strategy","type":"enum","options":"mean,median","required":"true","helpTip":""} "output_col_postfix": "_impute" # @param {"label":"output_col_postfix","type":"string","required":"true","helpTip":""}}missing_value_impute____id___ = MLSMissingValueImpute(**params)missing_value_impute____id___.run()# @output {"label":"dataframe","name":"missing_value_impute____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 输入 参数 子参数 参数说明 inputs dataframe 参数必选,表示输入的数据集;如果没有pipeline_model和decision_tree_regressor_model参数,表示直接根据数据集训练决策树回归算法得到特征重要性 pipeline_model 参数可选,如果含有该参数,表示根据上游的pyspark pipeline模型对象来计算特征重要性 decision_tree_regressor_model 参数可选,如果含有该参数,表示根据上游的决策树回归模型对象来计算特征重要性
  • 样例 inputs = { "dataframe": None, # @input {"label":"dataframe","type":"DataFrame"} "pipeline_model": None, # @input {"label":"pipeline_model","type":"PipelineModel"} "decision_tree_regressor_model": None}params = { "inputs": inputs, "input_columns_str": "", # @param {"label":"input_columns_str","type":"string","required":"false","helpTip":""} "label_col": "", # @param {"label":"label_col","type":"string","required":"true","helpTip":""} "model_input_features_col": "model_features", # @param {"label":"model_input_features_col","type":"string","required":"false","helpTip":""} "prediction_col": "prediction", # @param {"label":"prediction_col","type":"string","required":"false","helpTip":""} "max_depth": 5, # @param {"label":"max_depth","type":"integer","required":"false","range":"(0,2147483647]","helpTip":""} "max_bins": 32, # @param {"label":"max_bins","type":"integer","required":"false","range":"(0,2147483647]","helpTip":""} "min_instances_per_node": 1, # @param {"label":"min_instances_per_node","type":"integer","required":"false","range":"(0,2147483647]","helpTip":""} "min_info_gain": 0.0, # @param {"label":"min_info_gain","type":"number","required":"false","helpTip":""} "impurity": "variance"}dt_regression_feature_importance____id___ = MLSDecisionTreeRegressorFeatureImportance(**params)dt_regression_feature_importance____id___.run()# @output {"label":"dataframe","name":"dt_regression_feature_importance____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 参数说明 参数 子参数 参数说明 input_columns_str - 数据集的特征列名组成的格式化字符串,例如: "column_a" "column_a,column_b" label_col - 目标列名 model_input_features_col - 特征向量的列名 prediction_col - 训练模型时,预测结果对应的列名,默认为"prediction" max_depth - 树的最大深度,默认为5 max_bins - 分割特征时的最大分箱个数,默认为32 min_instances_per_node - 决策树分裂时要求每个节点必须包含的实例数目,默认为1 min_info_gain - 最小信息增益,默认为0.0
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "select_columns_str": "", # @param {"label":"select_columns_str","type":"string","required":"false","helpTip":""} "n_estimators": 100, # @param {"label":"n_estimators","type":"integer","required":"false","helpTip":""} "max_samples": "auto", # @param {"label":"max_samples","type":"string","required":"false","helpTip":""} "contamination": "auto", # @param {"label":"contamination","type":"string","required":"false","helpTip":""} "max_features": 1.0, # @param {"label":"max_features","type":"number","required":"false","helpTip":""} "bootstrap": False # @param {"label":"bootstrap","type":"boolean","required":"false","helpTip":""}}isolation_forest____id___ = MLSIsolationForest(**params)isolation_forest____id___.run()# @output {"label":"dataframe","name":"isolation_forest____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 样例 inputs = { "dataframe": None, # @input {"label":"dataframe","type":"DataFrame"} "pipeline_model": None, # @input {"label":"pipeline_model","type":"PipelineModel"} "gbt_classify_model": None}params = { "inputs": inputs, "input_columns_str": "", # @param {"label": "input_columns_str", "type": "string", "required": "false", "helpTip": ""} "label_col": "", # @param {"label": "label_col", "type": "string", "required": "true", "helpTip": ""} "model_input_features_col": "model_features", # @param {"label": "model_input_features_col", "type": "string", "required": "false", "helpTip": ""} "classifier_label_index_col": "label_index", # @param {"label": "classifier_label_index_col", "type": "string", "required": "false", "helpTip": ""} "prediction_index_col": "prediction_index", # @param {"label": "prediction_index_col", "type": "string", "required": "false", "helpTip": ""} "prediction_col": "prediction", # @param {"label": "prediction_col", "type": "string", "required": "false", "helpTip": ""} "max_depth": 5, # @param {"label": "max_depth", "type": "integer", "required": "false","range":"(0,2147483647]", "helpTip": ""} "max_bins": 32, # @param {"label": "max_bins", "type": "integer", "required": "false","range":"(0,2147483647]", "helpTip": ""} "min_instances_per_node": 1, # @param {"label": "min_instances_per_node", "type": "integer", "required": "false","range":"(0,2147483647]", "helpTip": ""} "min_info_gain": 0.0, # @param {"label": "min_info_gain", "type": "number", "required": "false", "helpTip": ""} "loss_type": "logistic", "max_iter": 20, # @param {"label": "max_iter", "type": "integer", "required": "false","range":"(0,2147483647]", "helpTip": ""} "step_size": 0.1, # @param {"label": "step_size", "type": "number", "required": "false", "helpTip": ""} "subsampling_rate": 1.0 # @param {"label": "subsampling_rate", "type": "number", "required": "false", "helpTip": ""}}gbt_classifier_feature_importance____id___ = MLSGBTClassifierFeatureImportance(**params)gbt_classifier_feature_importance____id___.run()# @output {"label":"dataframe","name":"gbt_classifier_feature_importance____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 参数说明 参数 子参数 参数说明 select_columns_str - 列名组成的格式化字符串,例如: "column_a" "column_a,column_b" n_estimators - 基学习器的数量,默认为100 max_samples - 从数据集中抽取多少个样本来训练每个基学习器,支持"auto"、int类型、float类型 contamination - 数据集中异常样本所占的比例,默认为"auto" max_features - 从数据集中抽取多少个特征来训练每个基学习器 bootstrap - 构建树时,是否进行有放回采样,True表示有放回采样,False表示无放回采样
  • 样例 inputs = { "dataframe": None, # @input {"label":"dataframe","type":"DataFrame"} "pipeline_model": None, # @input {"label":"pipeline_model","type":"PipelineModel"} "decision_tree_classify_model": None}params = { "inputs": inputs, "input_columns_str": "", # @param {"label":"input_columns_str","type":"string","required":"false","helpTip":""} "label_col": "", # @param {"label":"label_col","type":"string","required":"true","helpTip":""} "model_input_features_col": "model_features", # @param {"label":"model_input_features_col","type":"string","required":"false","helpTip":""} "classifier_label_index_col": "label_index", # @param {"label":"classifier_label_index_col","type":"string","required":"false","helpTip":""} "prediction_index_col": "prediction_index", # @param {"label":"prediction_index_col","type":"string","required":"false","helpTip":""} "prediction_col": "prediction", # @param {"label":"prediction_col","type":"string","required":"false","helpTip":""} "max_depth": 5, # @param {"label":"max_depth","type":"integer","required":"false","range":"(0,2147483647]","helpTip":""} "max_bins": 32, # @param {"label":"max_bins","type":"integer","required":"false","range":"(0,2147483647]","helpTip":""} "min_instances_per_node": 1, # @param {"label":"min_instances_per_node","type":"integer","required":"false","range":"(0,2147483647]","helpTip":""} "min_info_gain": 0.0, # @param {"label":"min_info_gain","type":"number","required":"false","helpTip":""} "impurity": "gini" # @param {"label":"impurity","type":"enum","required":"false","options":"entropy,gini","helpTip":""}}dt_classify_feature_importance____id___ = MLSDecisionTreeClassifierFeatureImportance(**params)dt_classify_feature_importance____id___.run()# @output {"label":"dataframe","name":"dt_classify_feature_importance____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 参数说明 参数 子参数 参数说明 input_columns_str - 数据集的特征列名组成的格式化字符串,例如: "column_a" "column_a,column_b" label_col - 目标列名 model_input_features_col - 特征向量的列名 classifier_label_index_col - 将目标列按照标签编码后的列名,默认为"label_index" prediction_index_col - 训练模型时,预测结果对应标签的列名,默认为"prediction_index" prediction_col - 训练模型时,预测结果对应的列名,默认为"prediction" max_depth - 树的最大深度 max_bins - 特征分裂时的最大分箱个数 min_instances_per_node - 树分裂时要求每个节点必须包含的实例数目,默认为1 min_info_gain - 最小信息增益 max_iter - 最大迭代次数 step_size - 步长 subsampling_rate - 训练每棵树时,对训练集的抽样率
  • 输入 参数 子参数 参数说明 inputs dataframe 参数必选,表示输入的数据集;如果没有pipeline_model和gbt_classify_model参数,表示直接根据数据集训练gbdt分类模型得到特征重要性 pipeline_model 参数可选,如果含有该参数,表示根据上游的pyspark pipeline模型对象pipeline_model来计算特征重要性 gbt_classify_model 参数可选,如果含有该参数,表示根据上游的gbt_classify_model对象来计算特征重要性
  • 输入 参数 子参数 参数说明 inputs dataframe 参数必选,表示输入的数据集。 如果没有pipeline_model和decision_tree_classify_model参数,表示直接根据数据集训练决策树分类算法得到特征重要性 pipeline_model 参数可选,如果含有该参数,表示根据上游的pyspark pipeline模型对象来计算特征重要性 decision_tree_classify_model 参数可选,如果含有该参数,表示根据上游的决策树分类模型对象来计算特征重要性
  • 参数说明 参数 子参数 参数说明 input_columns_str - 数据集的特征列名组成的格式化字符串,例如: "column_a" "column_a,column_b" label_col - 目标列名 model_input_features_col - 特征向量的列名 classifier_label_index_col - 将目标列按照标签编码后的列名,默认为"label_index" prediction_index_col - 训练模型时,预测结果对应标签的列名,默认为"prediction_index" prediction_col - 训练模型时,预测结果对应的列名,默认为"prediction" max_depth - 树的最大深度 max_bins - 分割特征时的最大分箱个数 min_instances_per_node - 决策树分裂时要求每个节点必须包含的实例数目 min_info_gain - 最小信息增益 impurity - 计算信息增益的标准,支持"gini"和"entropy"
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "k": "", # @param {"label":"k","type":"integer","required":"true","range":"(0,2147483647]","helpTip":""} "computeU": False, # @param {"label":"computeU","type":"boolean","required":"false","helpTip":""} "rCond": 1e-9 # @param {"label":"rCond","type":"integer","required":"false","range":"(0,2147483647]","helpTip":""}}svd____id___ = MLSSVD(**params)svd____id___.run()# @output {"label":"dataframe","name":"svd____id___.get_outputs()['output_port_1']","type":"DataFrame"}# @output {"label":"dataframe","name":"svd____id___.get_outputs()['output_port_2']","type":"DataFrame"}# @output {"label":"dataframe","name":"svd____id___.get_outputs()['output_port_3']","type":"DataFrame"}
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "b_output_action": True, "outer_pipeline_stages": None, "input_column_str": "", # @param {"label":"input_column_str","type":"string","required":"true","helpTip":""} "output_column_str": "", # @param {"label":"output_column_str","type":"string","required":"true","helpTip":""} "handle_invalid": "keep" # @param {"label":"handle_invalid","type":"enum","options":"skip,error,keep","required":"true","helpTip":""}}string_indexer____id___ = MLSStringIndexer(**params)string_indexer____id___.run()# @output {"label":"dataframe","name":"string_indexer____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "select_columns_str": "" # @param {"label":"select_columns_str","type":"string","required":"false","helpTip":""}}percentile_statistics____id___ = MLSPercentileStatistics(**params)percentile_statistics____id___.run()# @output {"label":"dataframe","name":"percentile_statistics____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 查找实例 Notebook页面展示了所有创建的实例。若需要展示特定的实例,可根据筛选条件快速查找。单击搜索框,可按照“名称”、“状态”、“计费资源类型”选择单个条件来筛选,或者同时选择多个筛选条件。 打开“查看所有”开关。打开开关后,可以看到IAM项目下所有子用户创建的Notebook实例。 按名称搜索:输入实例名称查找。 按状态筛选实例:例如快速筛选状态为“启动中”的实例。 按计费资源类型搜索:筛选计费类型为存储资源或计算资源计费的实例。 如果需要更改搜索条件,直接单击名称即可重新选择搜索条件,如下图所示。 图1 更改搜索条件
  • VS Code连接Notebook方式介绍 当用户创建完成支持SSH的Notebook实例后,使用VS Code的开发者可以通过以下三种方式连接到开发环境中: VS Code一键连接Notebook(推荐) 该方式是指在开发环境Console控制台上提供VS Code按钮,通过该入口自动打开VS Code并连接实例。 VS Code ToolKit连接Notebook(推荐) 该方式是指用户在VS Code上使用ModelArts VS Code Toolkit插件提供的登录和连接按钮,连接云上实例。 VS Code手动连接Notebook 该方式是指用户使用VS Code Remote SSH插件手工配置连接信息,连接云上实例。 父主题: 本地IDE(VS Code)
  • 前提条件 创建一个Notebook实例,并开启远程SSH开发,配置远程访问IP白名单。该实例状态必须处于“运行中”,具体参见创建Notebook实例章节。 在Notebook实例详情页面获取开发环境访问地址(例如:dev-modelarts-cnnorth4.huaweicloud.com)和端口号。 图1 Notebook实例详情页面 准备好密钥对文件。 密钥对在用户第一次创建时会自动下载,之后使用相同的密钥对时不会再出现下载界面(请务必妥善保存);用户也可以每次都使用新的密钥对。
  • 了解概念 算子 在MLS中,算子是一种基本功能单元,以ipynb格式保存,实质上是一段代码,对应Notebook中的一个Cell。 在算子中,开发者可以通过代码的形式封装一系列变量、方法或类,从而实现一个独立功能。 一个算子可以是一个机器学习算法的调用封装(比如调用Spark API封装一个随机森林算法),也可以是一个简单运算逻辑(比如完成一次简单加减运算),还可以是自行实现的任意功能脚本(比如从磁盘上读取数据)。 算链 算链是由若干算子通过一定规则组合而成的复合功能单元,以JSON格式进行保存,是一个大功能或解决方案的载体。 算链中的算子之间可以通过有向无环图(DAG)的形式组合,也可不与任何算子组合,一个算链可包含若干个DAG或零散算子。 在运行过程中,通过DAG形式组合的算子将严格按照DAG顺序调度运行,而未按DAG形式组合的算子则按照添加至算链的先后顺序运行。 MLS中的一个算链可转换成一个ipynb文件或一个python文件,开发者可基于转换的文件做进一步开发。
  • 亮点特性2:丰富的预置算子 MLS提供了丰富的预置算子,覆盖了机器学习建模全流程,包含数据分析、数据处理、特征工程、模型构建、模型评估和模型应用等多种算子类型,可极大程度地增强算法代码的可复用性,减少开发者的模型构建成本并提升开发效率。 开发者可以根据实际业务需要,方便快捷地设置预置算子参数、查看和修改预置算子源码,通过在算链中对预置算子进行参数调整和代码调整构建独特的业务场景需要的AI算法。 图2 丰富的预置算子
  • ML Studio是什么 ML Studio简称MLS,是ModelArts中的一个支持可视化机器学习建模的企业级AI开发工具,支持用户通过浏览器以全代码、少代码甚至零代码的方式开发AI模型。 MLS提供了图形化模型探索开发环境、丰富的预置算子和预置算链,并支持编写自定义算子,可帮助开发者快速构建具有实用价值的机器学习应用。 MLS为AI开发者提供可视化的操作界面来编排机器学习模型的训练、评估和预测的过程,无缝衔接数据分析和预测应用,为用户的数据挖掘分析业务提供易用、高效、高性能的工具。
  • 亮点特性1:可视化建模 MLS提供了用户友好的可视化模型探索或开发环境,开发者只需要通过简单拖拉拽操作编排算子,构建算链即可完成机器学习建模。 MLS中一个算链由一组算子组成,每个算子是一个功能独立的逻辑单元,算子通过有向无环图(DAG)的形式组织。 在MLS中,开发者可在算链编排界面上直观便利地进行算子增删、算子连线、算子参数设置、算子代码修改及算子结果预览等操作,支持开发者以REPL(Read Eval Print Loop)交互式开发方式进行模型调测。 图1 ML Studio
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "b_output_action": True, "b_use_default_encoder": True, "input_features_str": "", # @param {"label":"input_features_str","type":"string","required":"false","helpTip":""} "outer_pipeline_stages": None, "pca_input_features_col": "pca_input_features", # @param {"label":"pca_input_features_col","type":"string","required":"true","helpTip":""} "pca_output_vector_col": "pca_output_features", # @param {"label":"pca_output_vector_col","type":"string","required":"true","helpTip":""} "k": 5 # @param {"label":"k","type":"integer","required":"true","range":"(0,2147483647]","helpTip":"k"}}pca____id___ = MLSPCA(**params)pca____id___.run()# @output {"label":"dataframe","name":"pca____id___.get_outputs()['output_port_1']","type":"DataFrame"}
  • 参数说明 参数 子参数 参数说明 input_features_str - 输入的特征列名以逗号分隔组成的格式化字符串,例如: "column_a" "column_a,column_b" pca_input_features_col - 输入的特征向量列名,默认为"pca_input_features" pca_output_vector_col - 输出的特征向量列名,默认为"pca_output_features" k - 主成分的维度数
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "b_output_action": True, "outer_pipeline_stages": None, "input_features_str": "", # @param {"label":"input_features_str","type":"string","required":"false","helpTip":""} "output_col": "norm_output_features", # @param {"label":"output_col","type":"string","required":"false","helpTip":""} "p": 2 # @param {"label":"p","type":"integer","required":"false","helpTip":""}}normalizer____id___ = MLSNormalizer(**params)normalizer____id___.run()# @output {"label":"pipeline_model","name":"normalizer____id___.get_outputs()['output_port_1']","type":"PipelineModel"}
  • 参数说明 参数 子参数 参数说明 input_features_str - 输入的列名以逗号分隔组成的字符串,例如: "column_a" "column_a,column_b" output_column_postfix - 输出列名的后缀,输出列名由输入列名加上该后缀组成,默认为"_one_hot" handle_invalid - 无效值的处理方式,支持"keep"、"error",默认为"keep"。 drop_last - 在编码向量中,是否去除最后一个类别的编码,默认为True
  • 样例 inputs = { "dataframe": None # @input {"label":"dataframe","type":"DataFrame"}}params = { "inputs": inputs, "b_output_action": True, "outer_pipeline_stages": None, "input_features_str": "", # @param {"label":"input_features_str","type":"string","required":"false","helpTip":""} "output_column_postfix": "_one_hot", # @param {"label":"output_column_postfix","type":"string","required":"true","helpTip":""} "handle_invalid": "keep", # @param {"label":"handle_invalid","type":"enum","options":"keep,error","required":"true","helpTip":""} "drop_last": True # @param {"label":"drop_last","type":"boolean","required":"true","helpTip":""}}one_hot_encoder____id___ = MLSOneHotEncoder(**params)one_hot_encoder____id___.run()# @output {"label":"pipeline_model","name":"one_hot_encoder____id___.get_outputs()['output_port_1']","type":"PipelineModel"}# @output {"label":"dataframe","name":"one_hot_encoder____id___.get_outputs()['output_port_2']","type":"DataFrame"}
共100000条