summaryrefslogtreecommitdiffstats
path: root/scripts/elasticsearch/prepareElasticSearchBulkImport.pl
blob: e3fc6904c4a1a30b9f7e6bb16e738800c4e249a4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/perl

use strict;
use warnings;

# Convert a comma-separated audit export into an Elasticsearch bulk-import file.
#
# Usage: <script> <input-file> <output-file>
#
# Each non-blank input line produces two output lines: a bulk "index" action
# for the "auditdata" index, then a document built from the line's first 16
# comma-separated fields (fields missing from a short line are emitted as "").

# Three-argument open() treats an undefined filename as a request for an
# anonymous temporary file, so a missing argument would otherwise go unnoticed.
die "Usage: $0 <input-file> <output-file>\n" if @ARGV < 2;

my $filename = $ARGV[0];
my $outputfile= $ARGV[1];

# Rewrite every double-quoted section of a line so that it can no longer
# confuse the comma split: embedded commas become semicolons and the
# surrounding double quotes become single quotes.
#
# We have seen examples of the status field containing quoted comma-delimited
# strings, which breaks parsing of record data that is supposed to be
# comma-separated at the field level.
sub flatten_quoted_sections {
    my ($line) = @_;
    $line =~ s{("[^"]*")}{
        my $section = $1;
        $section =~ tr/,"/;'/;
        $section;
    }ge;
    return $line;
}

# Escape a field value for embedding inside a double-quoted JSON string.
# An undefined value (field absent from a short row) becomes the empty string.
sub json_escape {
    my ($value) = @_;
    return '' unless defined $value;
    $value =~ s/(["\\])/\\$1/g;
    $value =~ s/([\x00-\x1f])/sprintf('\\u%04x', ord($1))/ge;
    return $value;
}

# Render one violation object from seven already-escaped values, in order:
# timestamp, severity, type, MISSING_REL, entity type, vdc-id, category.
sub violation_json {
    my @value = @_;
    my $format =
        '{ "violationTimestamp": "%s", "severity": "%s", "violationType": "%s", '
      . '"violationDetails": { "MISSING_REL": "%s", "entityType": "%s", '
      . '"entityId": { "vdc-id": "%s" } }, "category": "%s" }';
    return sprintf( $format, @value );
}

# Render the document for one input row: fields 0-1 are the entity type and
# error message, fields 2-8 and 9-15 describe the two violations.
sub document_json {
    my @row   = @_;
    my @field = map { json_escape( $row[$_] ) } 0 .. 15;
    return sprintf(
        '{"entityType": "%s", "errorMessage": "%s", "violations": [%s, %s]}',
        @field[ 0, 1 ],
        violation_json( @field[ 2 .. 8 ] ),
        violation_json( @field[ 9 .. 15 ] ),
    );
}

open my $fh_input, '<', $filename or die "Cannot open $filename: $!";
open my $fh_output, '>', $outputfile or die "Cannot open $outputfile: $!";

while ( my $line = <$fh_input> ) {
    chomp($line);
    $line =~ s/\r\z//;    # tolerate CRLF line endings

    # A blank line carries no record; do not emit an empty document for it.
    next if $line =~ /\A\s*\z/;

    my @row = split /,/, flatten_quoted_sections($line);

    # The action line must be a complete JSON object on a line of its own.
    print {$fh_output} "{\"index\":{\"_index\":\"auditdata\",\"_type\":\"default\"}}\n"
        or die "Cannot write to $outputfile: $!";
    print {$fh_output} document_json(@row), "\n"
        or die "Cannot write to $outputfile: $!";
}

close($fh_input);

# Buffered write errors only surface when the handle is closed.
close($fh_output) or die "Cannot close $outputfile: $!";