You can submit deep learning jobs that use the public resource group by using the console or the SDK for Python. This topic describes how to download and install the SDK for Python and then use it to submit such jobs.
Prerequisites
- The public resource group of Deep Learning Containers (DLC) is activated. For more information, see Prepare a cluster of the public resource group type.
- The required service-linked role is assigned to DLC. For more information, see Grant permissions.
- The AccessKey pair of your Alibaba Cloud account is obtained. For more information, see Obtain an AccessKey pair.
Background information
For more information about how to use the console to submit deep learning jobs, see Submit jobs by using the console. You can also log on to a Data Science Workshop (DSW) development environment and perform the operations described in this topic.
Step 1: Install the SDK for Python
- Install the workspace SDK.
pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-a397e06e-edea-4756-ad50-fe7c6f7b0bf4-python-aiworkspace.zip
- Install the DLC SDK.
# Install the SDK for Python 2.
pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-b7e79745-b9b1-4060-946c-05b1dca491bc-python2-pai-dlc.zip
# Install the SDK for Python 3.
pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-b7e79745-b9b1-4060-946c-05b1dca491bc-python-pai-dlc.zip
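To confirm that both SDKs are installed, you can run a quick import check. This is a minimal sketch; the package names match the imports used in the sample code in Step 2:

# A quick sanity check: both packages should import without errors.
import alibabacloud_pai_dlc20201203
import alibabacloud_aiworkspace20210204
print('DLC and AI Workspace SDKs are installed')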
Step 2: Create and submit a deep learning job
Use the public resource group to create and submit a deep learning job.
When you create the job, you must specify the AccessKey pair of your Alibaba Cloud account. For more information about how to obtain the AccessKey pair, see Obtain an AccessKey pair. You can use the following sample code to create and submit a deep learning job:
#!/usr/bin/env python3
from __future__ import print_function

import json
import time

from alibabacloud_tea_openapi.models import Config
from alibabacloud_pai_dlc20201203.client import Client as DLCClient
from alibabacloud_pai_dlc20201203.models import (
    ListJobsRequest,
    ListEcsSpecsRequest,
    CreateJobRequest,
)
from alibabacloud_aiworkspace20210204.client import Client as AIWorkspaceClient
from alibabacloud_aiworkspace20210204.models import (
    ListWorkspacesRequest,
    CreateDatasetRequest,
    ListDatasetsRequest,
    ListImagesRequest,
    ListCodeSourcesRequest,
)


def create_nas_dataset(client, region, workspace_id, name,
                       nas_id, nas_path, mount_path):
    '''Create a NAS dataset.'''
    response = client.create_dataset(CreateDatasetRequest(
        workspace_id=workspace_id,
        name=name,
        data_type='COMMON',
        data_source_type='NAS',
        property='DIRECTORY',
        uri=f'nas://{nas_id}.{region}{nas_path}',
        accessibility='PRIVATE',
        source_type='USER',
        options=json.dumps({
            'mountPath': mount_path
        })
    ))
    return response.body.dataset_id


def create_oss_dataset(client, region, workspace_id, name,
                       oss_bucket, oss_endpoint, oss_path, mount_path):
    '''Create an Object Storage Service (OSS) dataset.'''
    response = client.create_dataset(CreateDatasetRequest(
        workspace_id=workspace_id,
        name=name,
        data_type='COMMON',
        data_source_type='OSS',
        property='DIRECTORY',
        uri=f'oss://{oss_bucket}.{oss_endpoint}{oss_path}',
        accessibility='PRIVATE',
        source_type='USER',
        options=json.dumps({
            'mountPath': mount_path
        })
    ))
    return response.body.dataset_id


def wait_for_job_to_terminate(client, job_id):
    # Poll the job status every 5 seconds until the job terminates.
    while True:
        job = client.get_job(job_id).body
        print('job is {}'.format(job.status))
        if job.status in ('Succeeded', 'Failed', 'Stopped'):
            return job.status
        time.sleep(5)


def main():
    # Make sure that your Alibaba Cloud account is granted the required permissions on DLC.
    region_id = 'cn-hangzhou'
    access_key_id = '<Your AccessKey ID>'
    access_key_secret = '<Your AccessKey secret>'

    # 1. Create the API clients.
    workspace_client = AIWorkspaceClient(
        Config(access_key_id=access_key_id,
               access_key_secret=access_key_secret,
               region_id=region_id,
               endpoint='aiworkspace.{}.aliyuncs.com'.format(region_id)))
    dlc_client = DLCClient(
        Config(access_key_id=access_key_id,
               access_key_secret=access_key_secret,
               region_id=region_id,
               endpoint='pai-dlc.{}.aliyuncs.com'.format(region_id)))

    print('------- Workspaces -----------')
    # Obtain the workspace list. You can specify the name of the workspace
    # that you created in the workspace_name parameter.
    workspaces = workspace_client.list_workspaces(ListWorkspacesRequest(
        page_number=1, page_size=1, workspace_name='',
        module_list='PAI'
    ))
    for workspace in workspaces.body.workspaces:
        print(workspace.workspace_name, workspace.workspace_id,
              workspace.status, workspace.creator)
    if len(workspaces.body.workspaces) == 0:
        raise RuntimeError('found no workspaces')
    workspace_id = workspaces.body.workspaces[0].workspace_id

    print('------- Images ------------')
    # Obtain the image list.
    images = workspace_client.list_images(ListImagesRequest(
        labels=','.join(['system.supported.dlc=true',
                         'system.framework=Tensorflow 1.15',
                         'system.pythonVersion=3.6',
                         'system.chipType=CPU'])))
    for image in images.body.images:
        print(json.dumps(image.to_map(), indent=2))
    image_uri = images.body.images[0].image_uri

    print('------- Datasets ----------')
    # Obtain the dataset.
    datasets = workspace_client.list_datasets(ListDatasetsRequest(
        workspace_id=workspace_id,
        name='example-nas-data', properties='DIRECTORY'))
    for dataset in datasets.body.datasets:
        print(dataset.name, dataset.dataset_id, dataset.uri, dataset.options)
    if len(datasets.body.datasets) == 0:
        # Create a dataset if the specified dataset does not exist.
        dataset_id = create_nas_dataset(
            client=workspace_client,
            region=region_id,
            workspace_id=workspace_id,
            name='example-nas-data',
            # The ID of the NAS file system.
            # General-purpose NAS: 31a8e4****.
            # Extreme NAS: The ID must start with extreme-. Example: extreme-0015****.
            # CPFS: The ID must start with cpfs-. Example: cpfs-125487****.
            nas_id='***',
            nas_path='/',
            mount_path='/mnt/data/nas')
        print('create dataset with id: {}'.format(dataset_id))
    else:
        dataset_id = datasets.body.datasets[0].dataset_id

    print('------- Code Sources ----------')
    # Obtain the source code file list.
    code_sources = workspace_client.list_code_sources(ListCodeSourcesRequest(
        workspace_id=workspace_id))
    for code_source in code_sources.body.code_sources:
        print(code_source.display_name, code_source.code_source_id, code_source.code_repo)

    print('-------- ECS SPECS ----------')
    # Obtain the DLC node specification list.
    ecs_specs = dlc_client.list_ecs_specs(ListEcsSpecsRequest(page_size=100, sort_by='Memory', order='asc'))
    for spec in ecs_specs.body.ecs_specs:
        print(spec.instance_type, spec.cpu, spec.memory, spec.gpu, spec.gpu_type)

    print('-------- Create Job ----------')
    # Create a deep learning job.
    create_job_resp = dlc_client.create_job(CreateJobRequest().from_map({
        'WorkspaceId': workspace_id,
        'DisplayName': 'sample-dlc-job',
        'JobType': 'TFJob',
        'JobSpecs': [
            {
                'Type': 'Worker',
                'Image': image_uri,
                'PodCount': 1,
                'EcsSpec': ecs_specs.body.ecs_specs[0].instance_type,
                'UseSpotInstance': False,
            },
        ],
        'UserCommand': "echo 'Hello World' && ls -R /mnt/data/ && sleep 30 && echo 'DONE'",
        'DataSources': [
            {
                'DataSourceId': dataset_id,
            },
        ],
    }))
    job_id = create_job_resp.body.job_id
    wait_for_job_to_terminate(dlc_client, job_id)

    print('-------- List Jobs ----------')
    # Obtain the deep learning job list.
    jobs = dlc_client.list_jobs(ListJobsRequest(
        workspace_id=workspace_id,
        page_number=1,
        page_size=10,
    ))
    for job in jobs.body.jobs:
        print(job.display_name, job.job_id, job.workspace_name,
              job.status, job.job_type)


if __name__ == '__main__':
    main()
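The sample script defines the create_oss_dataset function but does not call it. If your training data resides in OSS instead of NAS, you can call it in place of create_nas_dataset. The following sketch shows a possible call; the bucket name, paths, and endpoint are hypothetical placeholders that you must replace with your own values:

# Hypothetical values for illustration; replace with your own bucket, endpoint, and paths.
dataset_id = create_oss_dataset(
    client=workspace_client,
    region=region_id,
    workspace_id=workspace_id,
    name='example-oss-data',
    oss_bucket='<your-bucket-name>',
    # The public OSS endpoint of your region, for example oss-cn-hangzhou.aliyuncs.com.
    oss_endpoint=f'oss-{region_id}.aliyuncs.com',
    oss_path='/datasets/example/',
    mount_path='/mnt/data/oss')

The resulting dataset URI follows the oss://{bucket}.{endpoint}{path} pattern used in create_oss_dataset, and the dataset is mounted into the job at the path specified by mount_path, just as in the NAS case.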