
Example: Import to GCP from ADLS

Spark ADLS to GCP

### Note: This file must be placed at the following path in the core repository:

/scripts/global/azure_adls_to_gcp.py . It works as a library, so it only needs to
be uploaded once.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import sys
import json
from google.cloud import secretmanager

# You'll find these args in the .properties file, created in the .py files
arguments = sys.argv
# 1 Container
container = arguments[1]
# 2 Azure account
acct = arguments[2]
# 3 Data type [ORC-PARQUET]
format_file = arguments[3].lower()
# 4 Path in Azure
input_path = arguments[4]
# 5 Path in GCP
output_path = arguments[5]
# 6 Partition
partition_column = arguments[6]
# 7 Delimiter
delimiter = arguments[7]
# 8 Secret Manager
secret_key = arguments[8]

if partition_column == "":
    partition_column = 0

flagdelta = False

if format_file == 'delta':
    flagdelta = True
    format_file = 'parquet'

print("total arguments {}".format(len(arguments)))

# Retrieve the ADLS account key from Secret Manager
gcp_secret_client = secretmanager.SecretManagerServiceClient()
secret_resource_name = f"projects/wmt-mx-dl-core-dev/secrets/{secret_key}/versions/latest"  ### Change this value for PROD (wmt-mx-dl-core-prod)
response = gcp_secret_client.access_secret_version(name=secret_resource_name)
secret = json.loads(response.payload.data.decode('UTF-8'))
ssk = secret['key']

# OPTIONAL
def printf_args():
    print('************* Variables to connect to ADLS *************')
    print('Azure container:\t {}'.format(container))
    print('Azure storage account:\t {}'.format(acct))
    print('Format_file to read:\t {}'.format(format_file))
    print('Path where file exists:\t {}'.format(input_path))
    print('Path where file will exist in bucket:\t {}'.format(output_path))
    print('Partition column:\t {}'.format(partition_column))
    print('Delimiter:\t {}'.format(delimiter))

# astro-variables-ADLS_MX_DOFF_IBP

def create_spark_session():
    print('Number of arguments:\t {}'.format(len(arguments)))
    spark = SparkSession.builder \
        .master("yarn") \
        .appName('dataproc-job-read-azure') \
        .getOrCreate()
    spark.conf.set("fs.azure.account.key." + acct + ".blob.core.windows.net", ssk)
    return spark

def read_data_distribute_files(spark):
    print('************* read_data_distribute_files *************')
    print('The new format file is:\t {}'.format(format_file))

    print('******************** Reading dataframe ****************************')
    if delimiter == "":
        spark_df = spark.read.format(format_file) \
            .option('header', True) \
            .load('wasbs://' + container + '@' + acct + '.blob.core.windows.net/' + input_path)
    else:
        spark_df = spark.read.format(format_file) \
            .option('header', True) \
            .option('delimiter', delimiter) \
            .load('wasbs://' + container + '@' + acct + '.blob.core.windows.net/' + input_path)

    print('******************** Show 15 records from source ****************************')
    spark_df.show(15, False)

    # In the case of delta files, keep only the latest partition
    if flagdelta and partition_column != 0:
        print('The partition column is:\t {}'.format(partition_column))
        print('******************** Finding delta ****************************')
        last_partition = spark_df.agg({partition_column: 'max'}).collect()[0]['max({})'.format(partition_column)]
        print('****** Delta is: {0} \t = \t {1} ******'.format(partition_column, last_partition))
        spark_df = spark_df.filter(col(partition_column) == last_partition)
        print('******************** Show 15 records from delta ****************************')
        spark_df.show(15, False)

    print('******************** Total records read ****************************')
    print('Total records:\t {}'.format(spark_df.count()))

    print('******************** Writing dataframe ****************************')
    spark_df.write.format('orc').mode('overwrite').options(header='true').save(output_path)
    print('Writing table in path:\t {} to orc'.format(output_path))

def main():
    spark = create_spark_session()
    printf_args()
    read_data_distribute_files(spark)

if __name__ == "__main__":
    main()
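The library is driven entirely by its eight positional arguments. As an illustration only, the sketch below shows an ad-hoc submission with spark-submit from a Dataproc node; the bucket names, jar paths, and argument values are placeholders, and in normal operation the DAG below submits the job through DataprocSubmitJobOperator.

# Hypothetical ad-hoc run; every value below is a placeholder.
import subprocess

jars = ",".join([
    "gs://<code-bucket>/utilities/global_jars/azure_spark_jars/azure-storage-8.6.4.jar",
    "gs://<code-bucket>/utilities/global_jars/azure_spark_jars/hadoop-azure-3.2.1.jar",
])

subprocess.run([
    "spark-submit", "--jars", jars,
    "gs://<code-bucket>/scripts/global/azure_adls_to_gcp.py",
    "my-container",                 # 1 Container
    "mystorageaccount",             # 2 Azure account
    "parquet",                      # 3 Data type (orc/parquet/delta/csv)
    "path/source/in/adls/",         # 4 Path in Azure
    "gs://<landing-bucket>/landing/incremental/schema_name/table_name",  # 5 Path in GCP
    "",                             # 6 Partition (empty unless reading delta)
    "",                             # 7 Delimiter (only for csv)
    "astro-variables-SECRET_NAME",  # 8 Secret Manager secret name
], check=True)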

Dag.py

# DAG file for mx_fcst_new_item_comb

from pathlib import Path
from airflow import DAG
from datetime import datetime, timedelta, date
from workflow_framework import framework
from workflow_framework.config import *
from workflow_framework import callbacks
from airflow.operators.python import BranchPythonOperator
from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
import json
import pendulum

workflow = 'PROJNAME-OD-INC-SCHEMA_NAME-TABLE_NAME_FROM-ADLD'
config_file = 'projbaname-od-inc-schema_name-table_name-to-adld-hs_config.yaml'
config_path = 'incremental/schema_name/table_name/'
path = Path(__file__).with_name(config_file)
framework = framework.Framework(workflow, path)
sla_mins = 120
priority = 'P3'

# Define workflow tasks

def azure_to_gcs_task():
    gcs_code_bucket = Variable.get('GCS_CODE_BUCKET')
    pyspark_uri = 'gs://' + gcs_code_bucket + '/scripts/global/azure_adls_to_gcp.py'
    jar_azure_storage = 'gs://' + gcs_code_bucket + '/utilities/global_jars/azure_spark_jars/azure-storage-8.6.4.jar'
    jar_hadoop_storage = 'gs://' + gcs_code_bucket + '/utilities/global_jars/azure_spark_jars/hadoop-azure-3.2.1.jar'
    id_conn_id = Variable.get('CONN_ID_HS')

    ### The ADLS connection variables are read from a JSON configuration file
    config_file_azure = 'conf_azure.json'
    path_azure = Path(__file__).with_name(config_file_azure)
    with open(path_azure) as f:
        conf_azure = json.load(f)

    azure_to_gcs = DataprocSubmitJobOperator(
        task_id='azure_to_gcs',
        job={
            'reference': {'project_id': 'wmt-mx-dl-core-dev'},  ### Change this value for PROD (wmt-mx-dl-core-prod)
            'placement': {'cluster_name': conf_azure['cluster_name']},
            'pyspark_job': {'main_python_file_uri': pyspark_uri,
                            'jar_file_uris': [jar_azure_storage, jar_hadoop_storage],
                            'args': [conf_azure['azure_container'],   # 1 Container
                                     conf_azure['azure_account'],     # 2 Azure account
                                     conf_azure['read_data_type'],    # 3 Data type [ORC-PARQUET]
                                     conf_azure['azure_input_path'],  # 4 Path in Azure
                                     conf_azure['gcp_output_path'],   # 5 Path in GCP
                                     conf_azure['partition_column'],  # 6 Partition
                                     conf_azure['delimiter'],         # 7 Delimiter
                                     conf_azure['secret_key']         # 8 Secret Manager
                                     ]
                            }
        },
        region='us-east4',                ### Change this value for PROD (us-central1)
        project_id='wmt-mx-dl-core-dev',  ### Change this value for PROD (wmt-mx-dl-core-prod)
        gcp_conn_id=id_conn_id
    )
    return azure_to_gcs
# end_getParams

def failure_callback(context):
    callbacks.failure_callback(context, config_path, workflow, priority)

def sla_miss_callback(context):
    callbacks.sla_miss_callback(context, workflow, priority)

# Create the DAG

with DAG(workflow,
         default_args={
             'retries': 2,
             'email': 'mx-da-ibp@wal-mart.com',
             'email_on_failure': True,
             'email_on_retry': False,
             'sla': timedelta(minutes=int(sla_mins)),
         },
         description=workflow,
         schedule_interval=None,
         start_date=pendulum.datetime(2023, 1, 1, tz='America/Mexico_City'),
         catchup=False,
         on_failure_callback=failure_callback,
         sla_miss_callback=sla_miss_callback,
         tags=['P3', 'ADLS', 'supply chain'],
         max_active_runs=1
         ) as dag:

    end = DummyOperator(task_id='end', dag=dag, trigger_rule='none_failed_min_one_success')
    azure_to_gcs = azure_to_gcs_task()

    start = framework.build_task('START', 'start')
    create_params = framework.build_task('SCRIPT', 'create_params')
    to_raw = framework.build_task('HIVE', 'to_raw')
    end_load = framework.build_task('END', 'end')

    # Set up the task flow
    start >> create_params
    create_params >> azure_to_gcs
    azure_to_gcs >> to_raw
    to_raw >> end

Azure JSON

{ "__comentario_cluster_name__" :"nombre del cluster",


"cluster_name" : "azure_cluster_name",
"azure_container" : "contenedor",
"azure_account" : "cuentaDeAzure",
"__comentario_read_data_type__" :"tipo de archivos a procesar, si no es delta, no
es necesario enviar la particion, ya que se realiza un maximo para traer el ultimo
delta",
"read_data_type" : "parquet",
"__comentario_azure_input_path__" : "ruta de descarga desde el contenedor de blob
storage",
"azure_input_path" : "path/source/in/adls/",
"__comentario_gcp_output_path__" : "ruta de escritura en GCP",
"gcp_output_path" :
"gs://778a4..zxzw/landing/incremental/schema_name/target_table" ,
"__comentario_partition_column__" :"particion a procesar, si no se ocupa se envia
0, para no interferir en los parametros de entrada (default 0) solo en el caso que
el tipo de archivo sea delta, debido a que la clase obtiene solo la particion
maxima",
"partition_column" : "PROCESSED_TIME",
"__comentario_delimiter__" : "delimitador en caso que se deba leer csv, si no se
envia vacio",
"delimiter" : "",
"__comentario_specific_filter__" : "Query para traer información especifica,
solo lo que va en el filter o where",
"specific_filter" : "SUBSTRING(PROCESSED_TIME,1,8) == YYYYMMDD",
"secret_key" : "astro-variables-SECRET_NAME"
}
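The secret referenced by secret_key is expected to hold a JSON payload with a "key" field containing the ADLS storage account key, since the library reads secret['key']. Below is a minimal sketch, under that assumption, of storing such a payload with the Secret Manager client; the project ID and secret name are example values, and the secret must live in the core project the library points to (wmt-mx-dl-core-dev, or wmt-mx-dl-core-prod in PROD).

# Sketch with assumed names: add a secret version whose payload is {"key": "<adls-account-key>"}
import json
from google.cloud import secretmanager

client = secretmanager.SecretManagerServiceClient()
parent = client.secret_path("wmt-mx-dl-core-dev", "astro-variables-SECRET_NAME")

payload = json.dumps({"key": "<adls-storage-account-key>"}).encode("UTF-8")
client.add_secret_version(request={"parent": parent, "payload": {"data": payload}})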

Config Yaml

# Config file for intlmxecomm-mxall-banners-inc-local-highsecure-mx-fcst-new-item-comb-load
---
name: "prodjname-od-inc-schema-name-table-name-from-adls"
version: "1.1"
cluster_name: "schema-name-table-name-adls-to-gcp"
cluster_profile: "medium"
data_group: "core_hs"
path: "incremental/schema_name/table_name/"
properties_file: "inc-schema_name-table_name_properties.yaml"

# Workflow Tasks
tasks:

  ## Task 1
  - name: "start"
    type: "START"
    description: ""
    long_name: "start"
    curr_id: 1
    properties: {}

  ## Task 2
  - name: "to_raw"
    type: "HIVE"
    description: "Load the data from GCS into external table"
    long_name: "02_inc_schema_name_table_name_to_raw"
    curr_id: 2
    parents:
      - "start"
    properties_file: "02_inc_schema_name_table_name_to_raw.yaml"

  ## Task 3
  - name: "end"
    type: "END"
    description: ""
    long_name: "end"
    curr_id: 3
    parents:
      - "to_raw"
    properties:
      done_file_path: "$done_bucket/$target_schema/$target_table/$geo_region_cd"
      done_file_name: "$target_schema_$target_table_YYYYMMddHHmmss.done"

properties.yaml

---
cmpny_cd: "WMT-MX"
comp_name: "MX"
date_end: "<de>"
date_start: "<ds>"
division_code: "all_banners"
domain: "IBP"
emails: "aelermail@walmart.com"
geo_region_cd: "mx"
hadoop_engine: "tez"
hadoop_queue: "default"
load_type: "incremental"
priority: "P3"
project_name: "IBP_INT_DATOS"
schedule: "None"
sla_mins: "120"
tags: "IBP,P1,MISC"
target_schema: "schema_name"
target_table: "table_name"
user: "svcmxhs"
v_group: "mxschs"
v_permissions: "750"
archive_bucket: "gs://raw_bucket_hash"
raw_bucket: "gs://raw_bucket_hash"
stage_bucket: "gs://stg_bucket_hash"
target_bucket: "gs://catalog_bucket_hash"
done_bucket: "gs://done_bucket_hash"
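For reference, the done-file properties of the "end" task in the config YAML resolve against the bucket and table values above. A rough illustration follows, assuming the framework substitutes the $variables from properties.yaml and replaces the YYYYMMddHHmmss token with the run timestamp (the actual substitution is performed by the workflow framework, not by this snippet).

# Illustration only: how done_file_path / done_file_name would resolve with the
# properties above (placeholder substitution is done by the framework itself).
from datetime import datetime

props = {
    "done_bucket": "gs://done_bucket_hash",
    "target_schema": "schema_name",
    "target_table": "table_name",
    "geo_region_cd": "mx",
}

done_file_path = "{done_bucket}/{target_schema}/{target_table}/{geo_region_cd}".format(**props)
done_file_name = "{target_schema}_{target_table}_{ts}.done".format(
    ts=datetime.now().strftime("%Y%m%d%H%M%S"), **props)

print(done_file_path + "/" + done_file_name)
# e.g. gs://done_bucket_hash/schema_name/table_name/mx/schema_name_table_name_20230101120000.done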
