"""Strip vertices with duplicate uids, and the edges touching them, from a graph JSON file.

The input JSON is expected to contain a top-level ``vertices`` list and an
``edges`` list.  For every uid that occurs on more than one vertex, the first
vertex (in file order) is kept and the later ones are removed together with
every edge whose ``_outV`` or ``_inV`` refers to a removed vertex.

Usage: ``<script> -i <inputfile> [-o <outputfile>]``
"""
import getopt
import json
import sys
from collections import OrderedDict

# Set to True to enable the verbose output produced by debug().
debugFlag = False

_USAGE = '-i <inputfile> [-o <outputfile>]'


def join_strings(lst):
    """Return the items of *lst* as one string, each followed by a space.

    ``None`` items are skipped; every other item is converted with ``str()``
    so that non-string values (e.g. integer ids) can be joined as well.
    """
    return "".join(str(item) + " " for item in lst if item is not None)


def debug(desc, *args):
    """Print *desc* followed by the joined *args*, only if debugFlag is set."""
    if debugFlag:
        print(desc, join_strings(args))


def log(desc, arg):
    """Print a log line made of a description and a single value."""
    print(desc, arg)


def getUid(vertex):
    """Return the uid used to detect duplicates for *vertex*.

    The key holding the uid depends on the vertex's ``nodeLabel``:

    * ``user``     -> ``vertex['userId']``
    * ``tag``      -> ``vertex['name']``
    * ``lockNode`` -> ``vertex.get('uid')`` (may be ``None``)
    * no label     -> ``None``
    * anything else -> ``vertex['uid']``

    Raises:
        KeyError: if a required key is missing for the vertex's label.
    """
    uid = None
    nodeLabel = vertex.get('nodeLabel')
    debug(nodeLabel)
    if nodeLabel == 'user':
        uid = vertex['userId']
    elif nodeLabel == 'tag':
        uid = vertex['name']
    elif nodeLabel is None:
        pass
    elif nodeLabel == 'lockNode':
        uid = vertex.get('uid')
    else:
        uid = vertex['uid']
    debug(nodeLabel, uid)
    return uid


def generateFile(inputFile, outputFile):
    """Write a copy of *inputFile* without duplicate-uid vertices to *outputFile*.

    Args:
        inputFile: path of the JSON file to read; it must contain the
            top-level keys ``vertices`` and ``edges``.
        outputFile: path the filtered JSON is written to.

    Raises:
        KeyError: if ``vertices``/``edges`` are missing, or a vertex lacks
            ``_id`` or the uid key required by getUid().
    """
    # Load everything first so the input file is not held open while the
    # graph is processed and the output is written.
    with open(inputFile) as json_file:
        json_data = json.load(json_file)

    json_data_vertices = json_data['vertices']

    # Pass 1: find every uid that appears on more than one vertex.  The state
    # is local so that repeated calls do not influence each other.
    first_id_by_uid = {}
    duplicate_uids = {}
    for vertex in json_data_vertices:
        uid = getUid(vertex)
        if uid in first_id_by_uid:
            duplicate_uids[uid] = first_id_by_uid[uid]
        else:
            first_id_by_uid[uid] = vertex.get('_id')
    log("duplicate ids", duplicate_uids)
    log("number of vertices is", len(json_data_vertices))

    # Pass 2: for each duplicated uid keep the first vertex and schedule all
    # later ones for deletion.
    ids = {}
    deleteIndexes = []
    for vertex in json_data_vertices:
        vertex_id = vertex["_id"]
        uid = getUid(vertex)
        if uid not in duplicate_uids:
            continue
        debug("uid to id pair", uid if uid is not None else 'None', vertex_id)
        if uid in ids:
            ids[uid].append(vertex_id)
            deleteIndexes.append(vertex_id)
        else:
            ids[uid] = [vertex_id]
    log("ids", ids)
    log("deleteIndexes", deleteIndexes)
    log("deleteIndexes size", len(deleteIndexes))

    # A set gives constant-time membership tests while filtering.
    deleted = set(deleteIndexes)

    filter_vertex = [v for v in json_data_vertices if v.get('_id') not in deleted]
    json_data['vertices'] = filter_vertex
    log("number of vertexes after filter", len(filter_vertex))

    json_data_edges = json_data['edges']
    log("number of edges", len(json_data_edges))
    filter_edge = [
        e for e in json_data_edges
        if e['_outV'] not in deleted and e['_inV'] not in deleted
    ]
    json_data['edges'] = filter_edge
    log("number of edges after filter", len(json_data['edges']))

    # Emit top-level keys in reverse alphabetical order, which places
    # "vertices" ahead of "edges" in the output.
    json_data = OrderedDict(sorted(json_data.items(), key=lambda t: t[0], reverse=True))
    with open(outputFile, 'w') as outfile:
        json.dump(json_data, outfile)
    log("output file is", outputFile)


def main(argv):
    """Parse command-line options and run generateFile().

    Args:
        argv: the argument list without the program name.

    Options:
        -h              print usage and exit
        -i, --ifile     input JSON file (required)
        -o, --ofile     output file (default: ``<inputfile>.noduplicates``)

    Exits with status 2 on an option-parsing error and 3 when usage is
    requested or no input file is given.
    """
    print('Number of arguments:', len(sys.argv), 'arguments.')
    inputfile = None
    outputfile = None
    try:
        # "h" takes no argument; "i" and "o" each require one.
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(sys.argv[0], _USAGE)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(sys.argv[0], _USAGE)
            sys.exit(3)
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    if inputfile is None:
        print(sys.argv[0], _USAGE)
        sys.exit(3)
    if not outputfile:
        outputfile = inputfile + '.noduplicates'
    print('Input file is "', inputfile)
    generateFile(inputfile, outputfile)


if __name__ == "__main__":
    main(sys.argv[1:])