Use Spark on MaxCompute to access external data sources of the data lakehouse solution - MaxCompute

Spark on MaxCompute can access external data sources of the data lakehouse solution. If you have existing Spark jobs that you want to run on MaxCompute, you can use Spark on MaxCompute to access these external data sources directly, without the need to migrate the Spark jobs to MaxCompute. This reduces the cost of data processing. This topic describes how to use Spark on MaxCompute to access external data sources of the data lakehouse solution.

Access an external project based on a Hadoop data source

Access an external project by using MaxCompute SQL

-- The project hadoop_external_project is an external project that is mapped to a Hive database of E-MapReduce (EMR).
-- Access a non-partitioned table in the project.
SELECT * from hadoop_external_project.testtbl;
-- Access a partitioned table in the project.
SELECT * from hadoop_external_project.testtbl_par where b='20220914';

Access an external project by using Spark on MaxCompute

-- Configure the parameters.
-- Enable the features to access external tables and external projects. By default, the features that you can use to access external tables and external projects are disabled.
spark.sql.odps.enableExternalTable=true;
spark.sql.odps.enableExternalProject=true;
-- Specify the Spark version.
spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0;

-- Write code.
import org.apache.spark.sql.SparkSession

/**
 * Sample Spark on MaxCompute job that queries the external project
 * `hadoop_external_project6` through Spark SQL.
 *
 * The job lists the tables in the project, then describes and reads the
 * non-partitioned table `testtbl` and the partitioned table `testtbl_par`
 * (restricted to the partition `b='20220914'`). Each result is printed with
 * `show()`, preceded by a banner line on standard output.
 */
object external_Project_ReadTableHadoop {

  /**
   * Prints `banner` on its own line, then runs `statement` and prints its result.
   *
   * @param spark     session used to run the statement
   * @param banner    heading written to standard output before the result
   * @param statement Spark SQL statement to execute
   */
  private def runAndShow(spark: SparkSession, banner: String, statement: String): Unit = {
    // println (not print) so the banner is not fused onto the first line that show() prints.
    println(banner)
    spark.sql(statement).show()
  }

  /**
   * Entry point: builds the session, runs the sample queries, and stops the session.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("external_TableL-on-MaxCompute")
      // The timeout period of a broadcast join. The default value is 300 seconds.
      .config("spark.sql.broadcastTimeout", 20 * 60)
      // In strict mode, you must specify at least one static partition. In non-strict mode, all partitions can be dynamic.
      .config("odps.exec.dynamic.partition.mode", "nonstrict")
      .config("oss.endpoint","oss-cn-shanghai-internal.aliyuncs.com")
      .getOrCreate()

    try {
      // Access the external project hadoop_external_project6.
      runAndShow(
        spark,
        "=====show tables in hadoop_external_project6=====",
        "show tables in hadoop_external_project6")

      // Read data from a non-partitioned table in the external project.
      runAndShow(
        spark,
        "===============hadoop_external_project6.testtbl;================",
        "desc extended hadoop_external_project6.testtbl")
      runAndShow(
        spark,
        "===============hadoop_external_project6.testtbl;================",
        "SELECT * from hadoop_external_project6.testtbl")

      // Read data from a partitioned table in the external project.
      runAndShow(
        spark,
        "===============hadoop_external_project6.testtbl_par;================",
        "desc extended hadoop_external_project6.testtbl_par")
      runAndShow(
        spark,
        "===============hadoop_external_project6.testtbl_par;================",
        "SELECT * from hadoop_external_project6.testtbl_par where b='20220914'")
    } finally {
      // Stop the session even if one of the queries throws.
      spark.stop()
    }
  }

}

Access an external project based on DLF and OSS

Access an external project by using MaxCompute SQL

-- The project ext_dlf_0713 is an external project that is mapped to a Data Lake Formation (DLF) database.
-- Access a non-partitioned table in the project.
SELECT * from ext_dlf_0713.tbl_oss1;

Access an external project by using Spark on MaxCompute

-- Configure the parameters.
-- Enable the features to access external tables and external projects. By default, the features that you can use to access external tables and external projects are disabled.
spark.sql.odps.enableExternalTable=true;
spark.sql.odps.enableExternalProject=true;
-- Specify the Spark version.
spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0;
-- Add the following configuration to the code if the Object Storage Service (OSS) directory where data files are stored is created by using EMR.
spark.hadoop.odps.oss.location.uri.style=emr;
-- Specify the endpoint of OSS when you use Spark to access an external project based on OSS.
spark.hadoop.odps.oss.endpoint=oss-cn-shanghai-internal.aliyuncs.com;
-- Specify the region.
spark.hadoop.odps.region.id=cn-shanghai;
spark.hadoop.odps.oss.region.default=cn-shanghai;

-- Write code.
import org.apache.spark.sql.{SaveMode, SparkSession}
/**
 * Sample Spark on MaxCompute job that queries the external project
 * `ext_dlf_0713` through Spark SQL.
 *
 * The job lists the tables in the project, then describes and reads the
 * non-partitioned table `tbl_oss1`. Each result is printed with `show()`,
 * preceded by a banner line on standard output.
 */
object external_Project_ReadTable {

  /**
   * Prints `banner` on its own line, then runs `statement` and prints its result.
   *
   * @param spark     session used to run the statement
   * @param banner    heading written to standard output before the result
   * @param statement Spark SQL statement to execute
   */
  private def runAndShow(spark: SparkSession, banner: String, statement: String): Unit = {
    // println (not print) so the banner is not fused onto the first line that show() prints.
    println(banner)
    spark.sql(statement).show()
  }

  /**
   * Entry point: builds the session, runs the sample queries, and stops the session.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("external_TableL-on-MaxCompute")
      // The timeout period of a broadcast join. The default value is 300 seconds.
      .config("spark.sql.broadcastTimeout", 20 * 60)
      // In strict mode, you must specify at least one static partition. In non-strict mode, all partitions can be dynamic.
      .config("odps.exec.dynamic.partition.mode", "nonstrict")
      .config("oss.endpoint","oss-cn-shanghai-internal.aliyuncs.com")
      .getOrCreate()

    try {
      // Access the external project ext_dlf_0713.
      runAndShow(
        spark,
        "=====show tables in ext_dlf_0713=====",
        "show tables in ext_dlf_0713")

      // Read data from a non-partitioned table in the external project.
      runAndShow(
        spark,
        "===============ext_dlf_0713.tbl_oss1;================",
        "desc extended ext_dlf_0713.tbl_oss1")
      runAndShow(
        spark,
        "===============ext_dlf_0713.tbl_oss1;================",
        "SELECT * from ext_dlf_0713.tbl_oss1")
    } finally {
      // Stop the session even if one of the queries throws.
      spark.stop()
    }
  }

}