MaxComputeのSparkを使用して、データレイクハウスソリューションの外部データソースにアクセスできます。 セルフマネージドのSparkクラスターでSparkジョブを実行している場合でも、SparkジョブをMaxComputeに移行することなく、MaxComputeのSparkを使用してデータレイクハウスソリューションの外部データソースにアクセスできます。 これにより、データ処理のコストを削減できます。 このトピックでは、MaxComputeでSparkを使用して、データレイクハウスソリューションの外部データソースにアクセスする方法について説明します。
Hadoopデータソースに基づく外部プロジェクトへのアクセス
MaxCompute SQLを使用した外部プロジェクトへのアクセス
-- The project hadoop_external_project is an external project that is mapped to a Hive database of E-MapReduce (EMR).
-- Access a non-partitioned table in the project.
SELECT * from hadoop_external_project.testtbl;
-- Access a partitioned table in the project.
SELECT * from hadoop_external_project.testtbl_par where b='20220914';

MaxComputeでSparkを使用して外部プロジェクトにアクセスする
-- Configure the parameters.
-- Enable the features to access external tables and external projects. By default, the features that you can use to access external tables and external projects are disabled.
spark.sql.odps.enableExternalTable=true;
spark.sql.odps.enableExternalProject=true;
-- Specify the Spark version.
spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0;

-- Write code.
import org.apache.spark.sql.SparkSession

/**
 * Reads a non-partitioned table and a partitioned table from the Hadoop-based
 * external project `hadoop_external_project6` (mapped to an EMR Hive database)
 * by using Spark on MaxCompute.
 */
object external_Project_ReadTableHadoop {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("external_TableL-on-MaxCompute")
      // The timeout period of a broadcast join. The default value is 300 seconds.
      .config("spark.sql.broadcastTimeout", 20 * 60)
      // In strict mode, you must specify at least one static partition. In non-strict mode, all partitions can be dynamic.
      .config("odps.exec.dynamic.partition.mode", "nonstrict")
      .config("oss.endpoint", "oss-cn-shanghai-internal.aliyuncs.com")
      .getOrCreate()

    // Access the external project hadoop_external_project6.
    print("=====show tables in hadoop_external_project6=====")
    spark.sql("show tables in hadoop_external_project6").show()

    // Read data from a non-partitioned table in the external project.
    print("===============hadoop_external_project6.testtbl;================")
    spark.sql("desc extended hadoop_external_project6.testtbl").show()
    print("===============hadoop_external_project6.testtbl;================")
    spark.sql("SELECT * from hadoop_external_project6.testtbl").show()

    // Read data from a partitioned table in the external project.
    print("===============hadoop_external_project6.testtbl_par;================")
    spark.sql("desc extended hadoop_external_project6.testtbl_par").show()
    // Label fixed: this query reads testtbl_par, not testtbl.
    print("===============hadoop_external_project6.testtbl_par;================")
    spark.sql("SELECT * from hadoop_external_project6.testtbl_par where b='20220914'").show()
  }
}
DLFとOSSに基づく外部プロジェクトへのアクセス
MaxCompute SQLを使用した外部プロジェクトへのアクセス
-- The project ext_dlf_0713 is an external project that is mapped to a Data Lake Formation (DLF) database.
-- Access a non-partitioned table in the project.
SELECT * from ext_dlf_0713.tbl_oss1;

MaxComputeでSparkを使用して外部プロジェクトにアクセスする
-- Configure the parameters.
-- Enable the features to access external tables and external projects. By default, the features that you can use to access external tables and external projects are disabled.
spark.sql.odps.enableExternalTable=true;
spark.sql.odps.enableExternalProject=true;
-- Specify the Spark version.
spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0;
-- Add the following configuration to the code if the Object Storage Service (OSS) directory where data files are stored is created by using EMR.
spark.hadoop.odps.oss.location.uri.style=emr;
-- Specify the endpoint of OSS when you use Spark to access an external project based on OSS.
spark.hadoop.odps.oss.endpoint=oss-cn-shanghai-internal.aliyuncs.com;
-- Specify the region.
spark.hadoop.odps.region.id=cn-shanghai;
spark.hadoop.odps.oss.region.default=cn-shanghai;

-- Write code.
// Removed the unused SaveMode import: this sample only reads data.
import org.apache.spark.sql.SparkSession

/**
 * Reads a non-partitioned table from the DLF- and OSS-based external project
 * `ext_dlf_0713` by using Spark on MaxCompute.
 */
object external_Project_ReadTable {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("external_TableL-on-MaxCompute")
      // The timeout period of a broadcast join. The default value is 300 seconds.
      .config("spark.sql.broadcastTimeout", 20 * 60)
      // In strict mode, you must specify at least one static partition. In non-strict mode, all partitions can be dynamic.
      .config("odps.exec.dynamic.partition.mode", "nonstrict")
      .config("oss.endpoint", "oss-cn-shanghai-internal.aliyuncs.com")
      .getOrCreate()

    // Access the external project ext_dlf_0713.
    print("=====show tables in ext_dlf_0713=====")
    spark.sql("show tables in ext_dlf_0713").show()

    // Read data from a non-partitioned table in the external project.
    print("===============ext_dlf_0713.tbl_oss1;================")
    spark.sql("desc extended ext_dlf_0713.tbl_oss1").show()
    print("===============ext_dlf_0713.tbl_oss1;================")
    spark.sql("SELECT * from ext_dlf_0713.tbl_oss1").show()
  }
}