You can submit deep learning jobs that use the public resource group by using either the console or the SDK for Python. This topic describes how to install the SDK for Python and how to use it to submit such jobs.

Prerequisites

Background information

For more information about how to use the console to submit deep learning jobs, see Submit jobs by using the console. You can also log on to Data Science Workshop (DSW) Dev and perform the operations described in this topic.

Step 1: Install the SDK for Python

  • Install the workspace SDK.
    pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-a397e06e-edea-4756-ad50-fe7c6f7b0bf4-python-aiworkspace.zip
  • Install the DLC SDK.
    # Install the SDK for Python 2.
    pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-b7e79745-b9b1-4060-946c-05b1dca491bc-python2-pai-dlc.zip
    # Install the SDK for Python 3.
    pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-b7e79745-b9b1-4060-946c-05b1dca491bc-python-pai-dlc.zip

Step 2: Create and submit a deep learning job

Use the public resource group to create and submit a deep learning job.

When you create the job, you need to specify the AccessKey pair of your Alibaba Cloud account. For more information about how to obtain the AccessKey pair, see Obtain an AccessKey pair. You can use the following sample code to create and submit a deep learning job:
#!/usr/bin/env python3

from __future__ import print_function

import json
import time

from alibabacloud_tea_openapi.models import Config

from alibabacloud_pai_dlc20201203.client import Client as DLCClient
from alibabacloud_pai_dlc20201203.models import (
    ListJobsRequest,
    ListEcsSpecsRequest,
    CreateJobRequest
)

from alibabacloud_aiworkspace20210204.client import Client as AIWorkspaceClient
from alibabacloud_aiworkspace20210204.models import (
    ListWorkspacesRequest,
    CreateDatasetRequest,
    ListDatasetsRequest,
    ListImagesRequest,
    ListCodeSourcesRequest
)


def create_nas_dataset(client, region, workspace_id, name,
                       nas_id, nas_path, mount_path):
    '''Register a NAS directory as a private dataset and return its ID.

    Issues a single ``create_dataset`` call through ``client``. The dataset
    URI is assembled as ``nas://<nas_id>.<region><nas_path>``, and
    ``mount_path`` is sent in the JSON-encoded ``options`` field under the
    ``mountPath`` key.

    Returns the ``dataset_id`` taken from the response body.
    '''
    dataset_uri = 'nas://{}.{}{}'.format(nas_id, region, nas_path)
    dataset_options = json.dumps({'mountPath': mount_path})

    request = CreateDatasetRequest(
        workspace_id=workspace_id,
        name=name,
        data_type='COMMON',
        data_source_type='NAS',
        property='DIRECTORY',
        uri=dataset_uri,
        accessibility='PRIVATE',
        source_type='USER',
        options=dataset_options,
    )
    return client.create_dataset(request).body.dataset_id


def create_oss_dataset(client, region, workspace_id, name,
                       oss_bucket, oss_endpoint, oss_path, mount_path):
    '''Register an OSS directory as a private dataset and return its ID.

    Issues a single ``create_dataset`` call through ``client``. The dataset
    URI is assembled as ``oss://<oss_bucket>.<oss_endpoint><oss_path>``, and
    ``mount_path`` is sent in the JSON-encoded ``options`` field under the
    ``mountPath`` key. ``region`` is accepted for signature parity with
    ``create_nas_dataset`` but is not used when building the request.

    Returns the ``dataset_id`` taken from the response body.
    '''
    dataset_uri = 'oss://{}.{}{}'.format(oss_bucket, oss_endpoint, oss_path)
    dataset_options = json.dumps({'mountPath': mount_path})

    request = CreateDatasetRequest(
        workspace_id=workspace_id,
        name=name,
        data_type='COMMON',
        data_source_type='OSS',
        property='DIRECTORY',
        uri=dataset_uri,
        accessibility='PRIVATE',
        source_type='USER',
        options=dataset_options,
    )
    return client.create_dataset(request).body.dataset_id



def wait_for_job_to_terminate(client, job_id, interval=5):
    '''Poll a job until it reaches a terminal status and return that status.

    Calls ``client.get_job(job_id)`` repeatedly, printing the status seen on
    each poll, and stops as soon as the status is ``'Succeeded'``,
    ``'Failed'`` or ``'Stopped'``.

    Args:
        client: Object exposing ``get_job(job_id)`` whose result has a
            ``body.status`` attribute.
        job_id: Identifier passed unchanged to ``client.get_job``.
        interval: Seconds to sleep between polls. Defaults to 5.

    Returns:
        The terminal status string reported by the service.

    Note:
        There is no timeout: the loop runs until a terminal status is seen.
    '''
    terminal_statuses = ('Succeeded', 'Failed', 'Stopped')
    while True:
        job = client.get_job(job_id).body
        print('job is {}'.format(job.status))
        if job.status in terminal_statuses:
            return job.status
        time.sleep(interval)


def main():
    '''Run the end-to-end sample: pick resources, submit a job, and list jobs.

    Steps, in order:
      1. Build the AI workspace client and the DLC client.
      2. Take the first workspace returned by ``list_workspaces``.
      3. Take the first image matching the label filter.
      4. Reuse the dataset named ``example-nas-data`` or create it on NAS.
      5. Print the code sources of the workspace.
      6. Take the first ECS spec (the list is requested sorted by memory,
         ascending).
      7. Create a single-worker TFJob, wait for it to terminate, and print
         the first page of jobs in the workspace.

    Raises:
        RuntimeError: If no workspace, image, or ECS spec is returned, since
            the job cannot be created without them.
    '''
    # Make sure that your Alibaba Cloud account is granted the required permissions on DLC.
    region_id = 'cn-hangzhou'
    access_key_id = '<Your AccessKey ID>'
    access_key_secret = '<Your AccessKey secret>'

    # Create the clients.
    workspace_client = AIWorkspaceClient(
        Config(access_key_id=access_key_id,
               access_key_secret=access_key_secret,
               region_id=region_id,
               endpoint='aiworkspace.{}.aliyuncs.com'.format(region_id)))

    dlc_client = DLCClient(
        Config(access_key_id=access_key_id,
               access_key_secret=access_key_secret,
               region_id=region_id,
               endpoint='pai-dlc.{}.aliyuncs.com'.format(region_id)))

    print('------- Workspaces -----------')
    # Obtain the workspace list. You can specify the name of the workspace that you created in the workspace_name parameter.
    workspaces = workspace_client.list_workspaces(ListWorkspacesRequest(
        page_number=1, page_size=1, workspace_name='',
        module_list='PAI'
    ))
    found_workspaces = workspaces.body.workspaces
    if not found_workspaces:
        raise RuntimeError('found no workspaces')
    for workspace in found_workspaces:
        print(workspace.workspace_name, workspace.workspace_id,
              workspace.status, workspace.creator)

    workspace_id = found_workspaces[0].workspace_id

    print('------- Images ------------')
    # Obtain the image list.
    images = workspace_client.list_images(ListImagesRequest(
        labels=','.join(['system.supported.dlc=true',
                         'system.framework=Tensorflow 1.15',
                         'system.pythonVersion=3.6',
                         'system.chipType=CPU'])))
    found_images = images.body.images
    if not found_images:
        # Fail with a clear message instead of an IndexError on [0] below.
        raise RuntimeError('found no images')
    for image in found_images:
        print(json.dumps(image.to_map(), indent=2))

    image_uri = found_images[0].image_uri

    print('------- Datasets ----------')
    # Obtain the dataset.
    datasets = workspace_client.list_datasets(ListDatasetsRequest(
        workspace_id=workspace_id,
        name='example-nas-data', properties='DIRECTORY'))
    found_datasets = datasets.body.datasets or []
    for dataset in found_datasets:
        print(dataset.name, dataset.dataset_id, dataset.uri, dataset.options)

    if found_datasets:
        dataset_id = found_datasets[0].dataset_id
    else:
        # Create a dataset when the specified dataset does not exist.
        dataset_id = create_nas_dataset(
            client=workspace_client,
            region=region_id,
            workspace_id=workspace_id,
            name='example-nas-data',
            # The ID of the NAS file system.
            # General-purpose NAS: 31a8e4****.
            # Extreme NAS: The ID must start with extreme-. Example: extreme-0015****.
            # CPFS: The ID must start with cpfs-. Example: cpfs-125487****.
            nas_id='***',
            nas_path='/',
            mount_path='/mnt/data/nas')
        print('create dataset with id: {}'.format(dataset_id))

    print('------- Code Sources ----------')
    # Obtain the code source list.
    code_sources = workspace_client.list_code_sources(ListCodeSourcesRequest(
        workspace_id=workspace_id))
    for code_source in code_sources.body.code_sources or []:
        print(code_source.display_name, code_source.code_source_id, code_source.code_repo)

    print('-------- ECS SPECS ----------')
    # Obtain the DLC node specification list.
    ecs_specs = dlc_client.list_ecs_specs(ListEcsSpecsRequest(page_size=100, sort_by='Memory', order='asc'))
    found_specs = ecs_specs.body.ecs_specs
    if not found_specs:
        # Fail with a clear message instead of an IndexError on [0] below.
        raise RuntimeError('found no ECS specs')
    for spec in found_specs:
        print(spec.instance_type, spec.cpu, spec.memory, spec.gpu_type)

    print('-------- Create Job ----------')
    # Create a deep learning job.
    create_job_resp = dlc_client.create_job(CreateJobRequest().from_map({
        'WorkspaceId': workspace_id,
        'DisplayName': 'sample-dlc-job',
        'JobType': 'TFJob',
        'JobSpecs': [
            {
                "Type": "Worker",
                "Image": image_uri,
                "PodCount": 1,
                # First entry of the list requested in ascending memory order.
                "EcsSpec": found_specs[0].instance_type,
                "UseSpotInstance": False,
            },
        ],
        "UserCommand": "echo 'Hello World' && ls -R /mnt/data/ && sleep 30 && echo 'DONE'",
        'DataSources': [
            {
                "DataSourceId": dataset_id,
            },
        ],
    }))
    job_id = create_job_resp.body.job_id

    # Block until the job succeeds, fails, or is stopped.
    wait_for_job_to_terminate(dlc_client, job_id)

    print('-------- List Jobs ----------')
    # Obtain the deep learning job list.
    jobs = dlc_client.list_jobs(ListJobsRequest(
        workspace_id=workspace_id,
        page_number=1,
        page_size=10,
    ))
    for job in jobs.body.jobs or []:
        print(job.display_name, job.job_id, job.workspace_name,
              job.status, job.job_type)


# Run the sample only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()