Applicable when
- AWS Glue is used to process data
Implementation
The code below creates a new Glue job from a Python job script:
// Database construct for the job. The name produced by withEnv('catalog')
// serves as both the construct id and the databaseName prop.
const databaseName = withEnv('catalog');
const database = new Database(this, databaseName, { databaseName });
// Asset wrapping the local Glue job script; its s3ObjectUrl is what the job's
// command.scriptLocation points at. Replace the placeholder path before use.
const scriptAssetProps = { path: '<path_to_glue_job_script>' };
const scriptAsset = new Asset(this, withEnv('my-glue-job-script'), scriptAssetProps);
// Builds a policy document containing a single Allow statement for the given
// action(s) on the given resource.
const allowStatementPolicy = (actions, resource) =>
  PolicyDocument.fromJson({
    Version: '2012-10-17',
    Statement: [{ Effect: 'Allow', Action: actions, Resource: resource }],
  });

// IAM role assumed by the Glue service principal (glue.amazonaws.com); the job
// definition references it by roleName.
// NOTE(review): all three inline policies are very broad (S3 actions on
// 'arn:aws:s3:::*', glue:* and logs:* on '*') — consider narrowing them to the
// buckets, catalog resources and log groups the job actually needs.
const jobRole = new Role(this, withEnv('glue-role'), {
  roleName: withEnv('glue-role'),
  assumedBy: new ServicePrincipal('glue.amazonaws.com'),
  inlinePolicies: {
    allowS3ReadAndWrite: allowStatementPolicy(
      ['s3:ListBucket', 's3:GetObject', 's3:PutObject', 's3:DeleteObject'],
      'arn:aws:s3:::*',
    ),
    allowGlueActions: allowStatementPolicy('glue:*', '*'),
    allowLogging: allowStatementPolicy('logs:*', '*'),
  },
});
// Glue ETL job ('glueetl' command, Python 3) that runs the script asset under jobRole.
const jobName = withEnv('my-glue-job');
// jobName has already been passed through withEnv, so it is used directly as
// the construct id (same pattern as the database above) instead of being
// wrapped a second time.
// NOTE: a construct id feeds the CloudFormation logical ID; if this job is
// already deployed under the old id, keep the old id to avoid a replacement.
const glueJob = new CfnJob(this, jobName, {
  name: jobName,
  role: jobRole.roleName,
  command: {
    name: 'glueetl',
    pythonVersion: '3',
    scriptLocation: scriptAsset.s3ObjectUrl,
  },
  // Timeout in minutes. Change it based on the nature of the job
  // (set the minimum required to prevent unnecessary charges).
  timeout: 30,
  // NOTE(review): Glue 1.0 is an old runtime — confirm it is still supported
  // for new jobs and that the script is compatible before moving to a newer one.
  glueVersion: '1.0',
  defaultArguments: {
    '--GLUE_DB_NAME': database.databaseName, // Pass parameters like this
    // Glue job arguments are string key/value pairs, so the flag is the
    // string 'true' rather than a boolean.
    '--enable-continuous-cloudwatch-log': 'true', // Enable logging
  },
  executionProperty: {
    maxConcurrentRuns: 10, // Allow parallel runs
  },
});
Comments
0 comments
Please sign in to leave a comment.