Example Import GCP To ADLS
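The globals used throughout this script (container, acct, format_file, input_path, output_path, partition_column, delimiter, ssk, arguments) are not defined in the fragment below. A minimal sketch of where they presumably come from, with the positional order inferred from the args list in Dag.py further down; this parsing block is an assumption and is not part of the original script:

# Assumed argument handling, inferred from the 'args' list passed by the DAG task in Dag.py.
import sys

from pyspark.sql import SparkSession

arguments = sys.argv
# 1 container, 2 Azure account, 3 file format, 4 Azure input path,
# 5 GCP output path, 6 partition column, 7 delimiter, 8 storage secret key
(container, acct, format_file, input_path,
 output_path, partition_column, delimiter, ssk) = arguments[1:9]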
if partition_column == "":
    partition_column = 0

# Delta input is read through its underlying parquet files
flagdelta = False
if format_file == 'delta':
    flagdelta = True
    format_file = 'parquet'
# OPTIONAL
def printf_args():
    print('************* Variables for the Azure connection *************')
    print('Container where the file exists:\t {}'.format(container))
    print('Account where the container exists:\t {}'.format(acct))
    print('Format_file to read:\t {}'.format(format_file))
    print('Path where the file exists:\t {}'.format(input_path))
    print('Path where the file will exist in the bucket:\t {}'.format(output_path))
    print('partition_column:\t {}'.format(partition_column))
    print('delimiter:\t {}'.format(delimiter))
# astro-variables-ADLS_MX_DOFF_IBP
def create_spark_session():
    print('Number of arguments:\t {}'.format(len(arguments)))
    spark = SparkSession.builder \
        .master("yarn") \
        .appName('dataproc-job-read-azure') \
        .getOrCreate()
    # Register the Azure storage account key so the job can read from Blob Storage
    spark.conf.set("fs.azure.account.key." + acct + ".blob.core.windows.net", ssk)
    return spark
def read_data_distribute_files(spark):
    print('************* read_data_distribute_files *************')
    print('The new format file is:\t {}'.format(format_file))
def main():
    spark = create_spark_session()
    printf_args()
    read_data_distribute_files(spark)

if __name__ == "__main__":
    main()
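The body of read_data_distribute_files is left as a stub above. A sketch of what the read/write step might look like, assuming the standard wasb:// addressing for the account key set in create_spark_session and the gs:// output path passed in by the DAG; the CSV options, overwrite mode, and parquet output are illustrative assumptions, not taken from the original job:

# Illustrative sketch only; the original read/write logic is not included on this page.
def read_data_distribute_files(spark):
    print('************* read_data_distribute_files *************')
    print('The new format file is:\t {}'.format(format_file))

    # wasb:// URI for the container/account whose key was registered in create_spark_session
    azure_path = 'wasb://{}@{}.blob.core.windows.net/{}'.format(container, acct, input_path)

    reader = spark.read.format(format_file)
    if format_file == 'csv':
        reader = reader.option('delimiter', delimiter).option('header', 'true')
    df = reader.load(azure_path)

    # Write to the GCS output path; partition only when a partition column was provided
    writer = df.write.mode('overwrite')
    if partition_column != 0:
        writer = writer.partitionBy(partition_column)
    writer.parquet(output_path)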
Dag.py
workflow = 'PROJNAME-OD-INC-SCHEMA_NAME-TABLE_NAME_FROM-ADLD'
config_file = 'projbaname-od-inc-schema_name-table_name-to-adld-hs_config.yaml'
config_path = 'incremental/schema_name/table_name/'
path = Path(__file__).with_name(config_file)
framework = framework.Framework(workflow, path)
sla_mins = 120
priority = 'P3'
azure_to_gcs = DataprocSubmitJobOperator(
    task_id = 'azure_to_gcs',
    job = {
        'reference': {'project_id': 'wmt-mx-dl-core-dev'},   ### Change this value for PROD (wmt-mx-dl-core-prod)
        'placement': {'cluster_name': conf_azure['cluster_name']},
        'pyspark_job': {'main_python_file_uri': pyspark_uri,
                        'jar_file_uris': [jar_azure_storage,
                                          jar_hadoop_storage],
                        'args': [conf_azure['azure_container'],   #1 Container
                                 conf_azure['azure_account'],     #2 Azure account
                                 conf_azure['read_data_type'],    #3 Data type [ORC-PARQUET]
                                 conf_azure['azure_input_path'],  #4 Path in Azure
                                 conf_azure['gcp_output_path'],   #5 Path in GCP
                                 conf_azure['partition_column'],  #6 Partition column
                                 conf_azure['delimiter'],         #7 Delimiter
                                 conf_azure['secret_key']         #8 Secret Manager
                                ]
                        }
    },
    region = 'us-east4',   ### Change this value for PROD (us-central1)
    project_id = 'wmt-mx-dl-core-dev',   ### Change this value for PROD (wmt-mx-dl-core-prod)
    gcp_conn_id = id_conn_id
)
return azure_to_gcs
#end_getParams
def failure_callback(context):
    callbacks.failure_callback(context, config_path, workflow, priority)

def sla_miss_callback(context):
    callbacks.sla_miss_callback(context, workflow, priority)
Azure JSON
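The JSON body referenced by this heading is not reproduced on this page. Judging from the conf_azure lookups in Dag.py and the astro-variables-ADLS_MX_DOFF_IBP comment in the PySpark script, it is presumably stored as an Airflow Variable and deserialized as JSON; a minimal sketch under that assumption (the variable name and loading mechanism are assumptions):

# Assumed loading of the Azure JSON; the actual variable name and storage may differ.
from airflow.models import Variable

# Keys expected by Dag.py: cluster_name, azure_container, azure_account, read_data_type,
# azure_input_path, gcp_output_path, partition_column, delimiter, secret_key
conf_azure = Variable.get('ADLS_MX_DOFF_IBP', deserialize_json=True)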
Config Yaml
## Task 1
- name: "start"
  type: "START"
  description: ""
  long_name: "start"
  curr_id: 1
  properties: {}

## Task 2
- name: "to_raw"
  type: "HIVE"
  description: "Load the data from GCS into external table"
  long_name: "02_inc_schema_name_table_name_to_raw"
  curr_id: 2
  parents:
    - "start"
  properties_file: "02_inc_schema_name_table_name_to_raw.yaml"

## Task 3
- name: "end"
  type: "END"
  description: ""
  long_name: "end"
  curr_id: 3
  parents:
    - "to_raw"
  properties:
    done_file_path: "$done_bucket/$target_schema/$target_table/$geo_region_cd"
    done_file_name: "$target_schema_$target_table_YYYYMMddHHmmss.done"
properties.yaml
---
cmpny_cd: "WMT-MX"
comp_name: "MX"
date_end: "<de>"
date_start: "<ds>"
division_code: "all_banners"
domain: "IBP"
emails: "aelermail@walmart.com"
geo_region_cd: "mx"
hadoop_engine: "tez"
hadoop_queue: "default"
load_type: "incremental"
priority: "P3"
project_name: "IBP_INT_DATOS"
schedule: "None"
sla_mins: "120"
tags: "IBP,P1,MISC"
target_schema: "schema_name"
target_table: "table_name"
user: "svcmxhs"
v_group: "mxschs"
v_permissions: "750"
archive_bucket: "gs://raw_bucket_hash"
raw_bucket: "gs://raw_bucket_hash"
stage_bucket: "gs://stg_bucket_hash"
target_bucket: "gs://catalog_bucket_hash"
done_bucket: "gs://done_bucket_hash"