diff options
Diffstat (limited to 'asdctool/src/main/resources/scripts/python/duplicatesAndRemove.py')
-rw-r--r-- | asdctool/src/main/resources/scripts/python/duplicatesAndRemove.py | 238 |
1 files changed, 123 insertions, 115 deletions
"""Remove duplicate vertices (and the edges touching them) from a graph export.

The input is a JSON document with top-level ``vertices`` and ``edges`` lists.
Vertices are keyed by a logical uid (see ``getUid``); for every uid that occurs
more than once, the first vertex is kept and the later ones are dropped
together with every edge whose ``_outV`` or ``_inV`` refers to a dropped
vertex. The result is written to ``<inputfile>.noduplicates``.
"""
import getopt
import json
import sys
from collections import OrderedDict

# Module-level state kept for backward compatibility with existing importers.
# NOTE: ``dict`` shadows the builtin of the same name, so code in this module
# must use ``{}`` literals instead of calling ``dict(...)``.
dict = {}  # uid -> ``_id`` of the first vertex seen with that uid (reset per run)
dupliacteUid = {}
# debugFlag = True
debugFlag = False


def join_strings(lst):
    """Concatenate the non-None items of *lst*, each followed by one space.

    Non-string items (for example integer vertex ids) are converted with
    ``str`` so that mixed argument lists can be joined safely. The result
    keeps a trailing space after the last item; an empty input gives ``""``.
    """
    return "".join(str(item) + " " for item in lst if item is not None)


def debug(desc, *args):
    """Print *desc* followed by the joined *args*, only if debugFlag is set."""
    if debugFlag:
        print(desc, join_strings(args))


def log(desc, arg):
    """Print a log line made of a description and a single value."""
    print(desc, arg)


def getUid(vertex):
    """Return the logical unique id of *vertex*, chosen by its ``nodeLabel``.

    * ``user``     -> ``vertex['userId']``
    * ``tag``      -> ``vertex['name']``
    * no label     -> ``None``
    * ``lockNode`` -> ``vertex.get('uid')`` (may be ``None``)
    * anything else -> ``vertex['uid']``

    Raises:
        KeyError: if the field required for the vertex's label is missing
            (``lockNode`` is the only label read leniently).
    """
    uid = None
    nodeLabel = vertex.get('nodeLabel')
    debug(nodeLabel)
    if nodeLabel == 'user':
        uid = vertex['userId']
    elif nodeLabel == 'tag':
        uid = vertex['name']
    elif nodeLabel is None:
        pass
    elif nodeLabel == 'lockNode':
        uid = vertex.get('uid')
    else:
        uid = vertex['uid']

    debug(nodeLabel, uid)

    return uid


def generateFile(inputFile, outputFile):
    """Write a copy of *inputFile* without duplicate vertices to *outputFile*.

    Args:
        inputFile: path of a JSON file with ``vertices`` and ``edges`` lists.
        outputFile: path the filtered JSON document is written to.

    Raises:
        KeyError: if ``vertices``/``edges`` are missing, a vertex has no
            ``_id``, or an edge has no ``_outV``/``_inV``.
    """
    with open(inputFile) as json_file:
        json_data = json.load(json_file)

    json_data_vertices = json_data['vertices']

    # Start every run from a clean uid map; otherwise a second call in the
    # same process would report every vertex as a duplicate.
    dict.clear()

    # Pass 1: find the uids that occur more than once.
    # NOTE(review): vertices whose uid is None (no nodeLabel, or a lockNode
    # without 'uid') all share the key None and are therefore treated as
    # duplicates of each other - confirm this is intended.
    duplicate_uids = {}
    for vertex in json_data_vertices:
        uid = getUid(vertex)
        existId = dict.get(uid)
        if existId is None:
            dict[uid] = vertex.get('_id')
        else:
            duplicate_uids[uid] = existId

    log("duplicate ids", duplicate_uids)
    log("number of vertices is", len(json_data_vertices))

    # Pass 2: for each duplicated uid keep the first vertex id and mark all
    # later ones for deletion.
    ids = {}
    deleteIndexes = []
    for vertex in json_data_vertices:
        vertex_id = vertex["_id"]
        uid = getUid(vertex)
        if uid in duplicate_uids:
            debug("uid to id pair", uid if uid is not None else 'None', vertex_id)
            seen_ids = ids.get(uid)
            if seen_ids is None:
                ids[uid] = [vertex_id]
            else:
                seen_ids.append(vertex_id)
                deleteIndexes.append(vertex_id)

    log("ids", ids)
    log("deleteIndexes", deleteIndexes)
    log("deleteIndexes size", len(deleteIndexes))

    # A set gives O(1) membership tests while filtering vertices and edges.
    deleted = set(deleteIndexes)

    filter_vertex = [v for v in json_data_vertices if v.get('_id') not in deleted]
    json_data['vertices'] = filter_vertex

    log("number of vertexes after filter", len(filter_vertex))

    json_data_edges = json_data['edges']

    log("number of edges", len(json_data_edges))

    filter_edge = [e for e in json_data_edges
                   if e['_outV'] not in deleted and e['_inV'] not in deleted]
    json_data['edges'] = filter_edge

    log("number of edges after filter", len(json_data['edges']))

    # Emit top-level keys in reverse alphabetical order ('vertices' before 'edges').
    json_data = OrderedDict(sorted(json_data.items(), key=lambda t: t[0], reverse=True))

    with open(outputFile, 'w') as outfile:
        json.dump(json_data, outfile)
    log("output file is", outputFile)


def main(argv):
    """Parse the command line and de-duplicate the given input file.

    Usage: ``<script> -i <inputfile>``. The output is always written to
    ``<inputfile>.noduplicates``; ``-o/--ofile`` is accepted for
    compatibility but its value is not used.

    Exits with status 2 on an option parsing error and 3 when help is
    requested or no input file is given.
    """
    print('Number of arguments:', len(sys.argv), 'arguments.')
    inputfile = None
    try:
        # '-h' is a plain flag (no argument); '-i' and '-o' take a value.
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(sys.argv[0], '-i <inputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(sys.argv[0], '-i <inputfile>')
            sys.exit(3)
        elif opt in ("-i", "--ifile"):
            inputfile = arg

    if inputfile is None:
        print(sys.argv[0], '-i <inputfile>')
        sys.exit(3)

    print('Input file is "', inputfile)
    generateFile(inputfile, inputfile + '.noduplicates')


if __name__ == "__main__":
    main(sys.argv[1:])