1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
import json
import sys, getopt
from collections import OrderedDict
# Module-level uid -> vertex `_id` lookup table.
# NOTE(review): this name shadows the builtin `dict` for the whole module, so
# the builtin constructor is not reachable by that name from code in this file.
dict = {}
# Module-level mapping for duplicated uids. generateFile assigns its own local
# variable with this same (misspelled) name, so it does not fill this object.
dupliacteUid = {}
# Switch read by debug(): debug output is printed only while this is True.
#debugFlag = True
debugFlag = False
def join_strings(lst):
    """Concatenate the items of `lst`, each followed by a single space.

    `None` items are skipped. Every other item is rendered with `%s`
    formatting, so values that are not strings (ints, floats, booleans,
    lists, ...) are converted instead of raising `TypeError`.

    Returns the concatenated text; it keeps the trailing space after the
    last item and is the empty string when nothing is rendered.
    """
    # The one-element tuple keeps `%` from unpacking an item that is itself
    # a tuple.
    return "".join("%s " % (item,) for item in lst if item is not None)
def debug(desc, *args):
    """Print `desc` followed by the space-joined `args` when debugging is on.

    Nothing is printed unless the module-level `debugFlag` is truthy.
    `args` are rendered through join_strings().
    """
    if not debugFlag:
        return
    # A single parenthesised argument prints the same text whether `print`
    # is the Python 2 statement or the Python 3 function.
    print("%s %s" % (desc, join_strings(args)))
def log(desc, arg):
    """Print `desc` and `arg` on one line, separated by a single space."""
    # A single parenthesised argument prints the same text whether `print`
    # is the Python 2 statement or the Python 3 function.
    print("%s %s" % (desc, arg))
def getUid(vertex):
    """Return the identifying value of `vertex`, selected by its 'nodeLabel'.

    - no 'nodeLabel' (or a null one): returns None
    - 'user': the value under 'userId'
    - 'tag': the value under 'name'
    - 'lockNode': the value under 'uid', or None when that key is absent
    - any other label: the value under 'uid'

    Raises KeyError when the key required for a 'user', 'tag' or other
    labelled vertex is missing. The label and the result are passed to
    debug().
    """
    label = vertex.get('nodeLabel')
    debug(label)

    if label is None:
        identifier = None
    elif label == 'lockNode':
        # Lock nodes are the only labelled vertices allowed to lack a 'uid'.
        identifier = vertex.get('uid')
    else:
        required_key = (
            'userId' if label == 'user'
            else 'name' if label == 'tag'
            else 'uid'
        )
        identifier = vertex[required_key]

    debug(label, identifier)
    return identifier
def generateFile(inputFile, outputFile):
    """Write a copy of the graph in `inputFile` without duplicated vertices.

    `inputFile` must hold a JSON object with 'vertices' and 'edges' lists.
    Vertices are compared by the value getUid() returns for them. When
    several vertices share that value, the first one in file order is kept
    and the later ones are removed, together with every edge whose '_outV'
    or '_inV' refers to a removed vertex. The result is written to
    `outputFile` as JSON with the top-level keys in descending order.

    Progress information is reported through log().

    Raises KeyError when 'vertices' or 'edges' is missing, when a vertex has
    no '_id', or when an edge has no '_outV' / '_inV'.

    NOTE(review): vertices for which getUid() returns None all share the key
    None, so they are treated as duplicates of one another.
    """
    # Only the load needs the input file; release the handle right after.
    with open(inputFile) as json_file:
        json_data = json.load(json_file)

    vertices = json_data['vertices']
    duplicate_uids = _find_duplicate_uids(vertices)
    log("duplicate ids", duplicate_uids)
    log("number of vertices is", len(vertices))

    ids_by_uid, ids_to_delete = _collect_duplicate_ids(vertices, duplicate_uids)
    log("ids", ids_by_uid)
    log("deleteIndexes", ids_to_delete)
    log("deleteIndexes size", len(ids_to_delete))

    # A set gives constant-time membership tests; the list above is kept
    # only so the log shows the ids in file order.
    doomed = set(ids_to_delete)

    kept_vertices = [v for v in vertices if v.get('_id') not in doomed]
    json_data['vertices'] = kept_vertices
    log("number of vertexes after filter", len(kept_vertices))

    edges = json_data['edges']
    log("number of edges", len(edges))
    kept_edges = [
        e for e in edges
        if e['_outV'] not in doomed and e['_inV'] not in doomed
    ]
    json_data['edges'] = kept_edges
    log("number of edges after filter", len(kept_edges))

    # Emit the top-level keys in descending order (e.g. 'vertices' first).
    ordered = OrderedDict(
        sorted(json_data.items(), key=lambda item: item[0], reverse=True)
    )
    with open(outputFile, 'w') as outfile:
        json.dump(ordered, outfile)
    log("output file is", outputFile)


def _find_duplicate_uids(vertices):
    """Map each uid carried by more than one vertex to its first '_id'."""
    # Local state: nothing is remembered between calls.
    first_id_by_uid = {}
    duplicate_uids = {}
    for vertex in vertices:
        uid = getUid(vertex)
        first_id = first_id_by_uid.get(uid)
        if first_id is None:
            first_id_by_uid[uid] = vertex.get('_id')
        else:
            duplicate_uids[uid] = first_id
    return duplicate_uids


def _collect_duplicate_ids(vertices, duplicate_uids):
    """Group the '_id's of duplicated vertices by uid and list those to drop.

    Returns `(ids_by_uid, ids_to_delete)`. `ids_by_uid` holds every '_id'
    seen for each duplicated uid; `ids_to_delete` holds all of them except
    the first one per uid, in file order.
    """
    ids_by_uid = {}
    ids_to_delete = []
    for vertex in vertices:
        vertex_id = vertex["_id"]
        uid = getUid(vertex)
        if duplicate_uids.get(uid) is None:
            continue
        debug("uid to id pair", uid if uid is not None else 'None', vertex_id)
        known_ids = ids_by_uid.get(uid)
        if known_ids is None:
            # First vertex with this uid: remember it and keep it.
            ids_by_uid[uid] = [vertex_id]
        else:
            known_ids.append(vertex_id)
            ids_to_delete.append(vertex_id)
    return ids_by_uid, ids_to_delete
def main(argv):
    """Parse the command line in `argv` and run generateFile().

    Options:
      -h                    print the usage line and exit with status 3
      -i, --ifile <path>    input file (required)
      -o, --ofile <path>    output file; defaults to the input path with
                            '.noduplicates' appended

    Exits with status 2 on an unknown or malformed option and with status 3
    when no input file is given.
    """
    usage = '%s -i <inputfile> [-o <outputfile>]' % (sys.argv[0],)
    print('Number of arguments: %d arguments.' % len(sys.argv))

    try:
        # '-h' is a plain flag, so it takes no ':' in the option spec.
        opts, _unused = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    inputfile = None
    outputfile = None
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit(3)
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    if inputfile is None:
        print(usage)
        sys.exit(3)
    if not outputfile:
        # No explicit output path: derive one from the input path.
        outputfile = inputfile + '.noduplicates'

    print('Input file is "%s"' % (inputfile,))
    generateFile(inputfile, outputfile)
# Script entry point: forward the command-line arguments, without the
# program name, to main() when this file is executed directly.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
# print x['uid']
|