diff options
Diffstat (limited to 'asdctool/src/main/resources/scripts/python/duplicatesAndRemove.py')
-rw-r--r-- | asdctool/src/main/resources/scripts/python/duplicatesAndRemove.py | 136 |
1 files changed, 136 insertions, 0 deletions
"""Remove duplicate vertices, and the edges touching them, from a graph JSON export.

The input is a JSON document with a 'vertices' list and an 'edges' list.
Vertices are grouped by a logical uid (see getUid); for every uid that occurs
more than once only the first vertex is kept.  Edges whose '_outV' or '_inV'
refers to a removed vertex are dropped as well.

Usage: duplicatesAndRemove.py -i <inputfile> [-o <outputfile>]
The output defaults to '<inputfile>.noduplicates'.

The print calls use a single pre-formatted string so the script produces the
same output under Python 2.7 and Python 3.
"""
import getopt
import json
import sys
from collections import OrderedDict

# Set to True to enable the verbose debug() output.
debugFlag = False


def join_strings(lst):
    """Return the non-None items of lst as text, each followed by one space."""
    return "".join("%s " % (item,) for item in lst if item is not None)


def debug(desc, *args):
    """Print desc followed by args, but only when debugFlag is enabled."""
    if debugFlag:
        print("%s %s" % (desc, join_strings(args)))


def log(desc, arg):
    """Print a description and a single value separated by a space."""
    print("%s %s" % (desc, arg))


def getUid(vertex):
    """Return the logical unique id of a vertex dict, or None if it has none.

    The key holding the id depends on the vertex's 'nodeLabel':
    'user' -> 'userId', 'tag' -> 'name', anything else -> 'uid'.
    A vertex without a nodeLabel yields None; a 'lockNode' without a 'uid'
    also yields None.  Any other labelled vertex missing its id key raises
    KeyError.
    """
    label = vertex.get('nodeLabel')
    debug(label)

    if label is None:
        uid = None
    elif label == 'user':
        uid = vertex['userId']
    elif label == 'tag':
        uid = vertex['name']
    elif label == 'lockNode':
        uid = vertex.get('uid')
    else:
        uid = vertex['uid']

    debug(label, uid)
    return uid


def _find_duplicates(vertices):
    """Scan vertices once and report which ones repeat an already-seen uid.

    Returns a tuple (duplicate_uids, ids, delete_ids):
      duplicate_uids -- uid -> '_id' of the first vertex carrying that uid
      ids            -- uid -> every '_id' carrying that uid, in file order
      delete_ids     -- '_id' of every vertex after the first, in file order
    Only uids that occur more than once appear in the two mappings.
    """
    first_id_by_uid = {}
    ids = {}
    delete_ids = []

    for vertex in vertices:
        uid = getUid(vertex)
        vertex_id = vertex["_id"]
        if uid not in first_id_by_uid:
            first_id_by_uid[uid] = vertex_id
            continue
        # NOTE(review): vertices for which getUid() returns None all share the
        # key None, so every one after the first is treated as a duplicate.
        # This matches the previous behaviour -- confirm it is intended.
        debug("uid to id pair", uid if uid is not None else 'None', vertex_id)
        ids.setdefault(uid, [first_id_by_uid[uid]]).append(vertex_id)
        delete_ids.append(vertex_id)

    duplicate_uids = dict((uid, first_id_by_uid[uid]) for uid in ids)
    return duplicate_uids, ids, delete_ids


def _as_lookup(values):
    """Return values as a set for O(1) membership, or unchanged if unhashable."""
    try:
        return set(values)
    except TypeError:
        return values


def generateFile(inputFile, outputFile):
    """Write a copy of inputFile to outputFile with duplicate vertices removed.

    inputFile  -- path of a JSON file containing 'vertices' and 'edges' lists
    outputFile -- path the filtered JSON document is written to

    All bookkeeping is local to the call, so the function can be invoked
    repeatedly with consistent results.  Raises KeyError when the document
    lacks 'vertices'/'edges' or a vertex lacks '_id'.
    """
    with open(inputFile) as json_file:
        json_data = json.load(json_file)

    vertices = json_data['vertices']
    duplicate_uids, ids, delete_ids = _find_duplicates(vertices)

    log("duplicate ids", duplicate_uids)
    log("number of vertices is", len(vertices))
    log("ids", ids)
    log("deleteIndexes", delete_ids)
    log("deleteIndexes size", len(delete_ids))

    # Membership is tested once per vertex and twice per edge; use a set.
    doomed = _as_lookup(delete_ids)

    kept_vertices = [v for v in vertices if v.get('_id') not in doomed]
    json_data['vertices'] = kept_vertices
    log("number of vertexes after filter", len(kept_vertices))

    edges = json_data['edges']
    log("number of edges", len(edges))

    kept_edges = [
        e for e in edges
        if e['_outV'] not in doomed and e['_inV'] not in doomed
    ]
    json_data['edges'] = kept_edges
    log("number of edges after filter", len(kept_edges))

    # Emit top-level keys in reverse alphabetical order so that 'vertices'
    # is written before 'edges'.
    json_data = OrderedDict(
        sorted(json_data.items(), key=lambda item: item[0], reverse=True))

    with open(outputFile, 'w') as outfile:
        json.dump(json_data, outfile)
    log("output file is", outputFile)


def main(argv):
    """Parse command-line options and run generateFile.

    argv -- the argument list without the program name (sys.argv[1:])

    Options: -i/--ifile <inputfile> (required), -o/--ofile <outputfile>
    (optional, defaults to '<inputfile>.noduplicates'), -h/--help.
    Exits with status 2 on an invalid option and 3 when help is requested
    or no input file is given.
    """
    usage = "%s -i <inputfile> [-o <outputfile>]" % sys.argv[0]
    print("Number of arguments: %s arguments." % len(sys.argv))

    inputfile = None
    outputfile = None
    try:
        # 'h' takes no argument; 'i' and 'o' each require one.
        opts, _ = getopt.getopt(argv, "hi:o:", ["help", "ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit(3)
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    if inputfile is None:
        print(usage)
        sys.exit(3)

    if not outputfile:
        outputfile = inputfile + '.noduplicates'

    print('Input file is " %s' % inputfile)
    generateFile(inputfile, outputfile)


if __name__ == "__main__":
    main(sys.argv[1:])