From 06bf4cdcedf782af4f8ff2485f0ad3bc97b08c73 Mon Sep 17 00:00:00 2001
From: karbon
Date: Mon, 15 Mar 2021 10:43:25 +0800
Subject: feat:NLP for IBN in UUI

NLP for IBN in UUI

Issue-ID: USECASEUI-525
Signed-off-by: karbon
Change-Id: I9f26312748ebaedb3115035c4af8b0abf19d59bc
---
 LICENSE                          |  15 ++++
 README.md                        |  97 ++++++------------------
 assembly.xml                     |  65 ++++++++++++++++
 initialize.sh                    |  16 ++++
 mvn-phase-script.sh              |  86 +++++++++++++++++++++
 pom.xml                          | 113 ++++++++++++++++++++++++++++
 releases/1.0.0-container.yaml    |  10 +++
 requirements.txt                 |   4 +
 scripts/api_squad.py             |  45 ++++-------
 scripts/api_squad_offline.py     |  48 ++++++------
 scripts/api_squad_online.py      |  39 ++--------
 scripts/create_squad_features.py | 157 ++++++++++++++++-----------------------
 scripts/global_setting.py        |  31 ++++----
 test_1.py                        |  24 ++++++
 tox.ini                          |  26 +++++++
 version.properties               |  27 +++++++
 16 files changed, 535 insertions(+), 268 deletions(-)
 create mode 100644 LICENSE
 create mode 100644 assembly.xml
 create mode 100644 initialize.sh
 create mode 100644 mvn-phase-script.sh
 create mode 100644 pom.xml
 create mode 100644 releases/1.0.0-container.yaml
 create mode 100644 requirements.txt
 create mode 100644 test_1.py
 create mode 100644 tox.ini
 create mode 100644 version.properties

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8332271
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,15 @@
+# Copyright 2017 CTC, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Micro service of NLP.
diff --git a/README.md b/README.md
index 40c498b..2b3289f 100644
--- a/README.md
+++ b/README.md
@@ -1,74 +1,23 @@
-# Usecase UI NLP
-
-This is the NLP processing part of Usecase UI project.
-
-### Project Structure
-
-### Git commit message rules
-
-PLEASE obey [AngularJS Git Commit Message Conventions](https://docs.google.com/document/d/1QrDFcIiPjSLDn3EL15IJygNPiHORgU1_OOAqWjiDU5Y/edit#) when writing the commit messages. One commit message should include three parts: `Header`, `Body` and `Footer`. The `Header` part is necessary, and the other two parts are optional. The `Header` part follows the rule as: `<type>(<scope>): <subject>`. `type` and `subject` are necessary, `scope` is optional. Only 7 tokens are allowed for `type`:
- * feat: new features (feature)
- * fix: fix bugs
- * docs: documentation
- * style: style
- * refactor: reconstruction
- * test: add test
- * chore: change for construction and assistant tools
-
-For example:
-
-```bash
-feat(directive): ng:disabled, ng:checked, ng:multiple, ng:readonly, ng:selected
------------------------------------------------------
-docs(guide): updated fixed docs from Google Docs
-
-Couple of typos fixed:
-- indentation
-- batchLogbatchLog -> batchLog
-- start periodic checking
-- missing brace
-```
-
-When there are breaking changes in the project, please write the commit message in `Footer`. For example:
-
-```bash
-BREAKING CHANGE: isolate scope bindings definition has changed.
-
-    To migrate the code follow the example below:
-
-    Before:
-
-    scope: {
-      myAttr: 'attribute',
-    }
-
-    After:
-
-    scope: {
-      myAttr: '@',
-    }
-
-    The removed `inject` wasn't generaly useful for directives so there should be no code using it.
-```
-
-Also, running the script `npm run changelog` can generate all the `feat` and `fix` commits. Click [CHANGELOG.md](./CHANGELOG.md) to show all these commit histories.
-
-### Contributor
-
-```
-Copyright 2021 China Telecom.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-```
+# Copyright 2017 CTC, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Micro service of NLP.
+
+1. Code structure guide
+   ./          project files
+   ./assembly  docker related scripts
+   ./logs      log file
+   ./scripts   NLP service
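+
+2. Quick start (an illustrative sketch -- each service script reads its
+   listen port from argv[1], so the ports below are only examples)
+   pip3 install -r requirements.txt
+   python3 scripts/api_squad_online.py 33011     # online prediction API
+   python3 scripts/api_squad_offline.py 33012    # offline training API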
diff --git a/assembly.xml b/assembly.xml
new file mode 100644
index 0000000..011ac9e
--- /dev/null
+++ b/assembly.xml
@@ -0,0 +1,65 @@
+<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+  <id>nlp</id>
+  <formats>
+    <format>zip</format>
+  </formats>
+  <fileSets>
+    <fileSet>
+      <directory>scripts</directory>
+      <outputDirectory>/scripts</outputDirectory>
+      <includes>
+        <include>**/*.py</include>
+        <include>**/*.json</include>
+        <include>**/*.xml</include>
+        <include>**/*.yml</include>
+        <include>**/*.sh</include>
+      </includes>
+    </fileSet>
+    <fileSet>
+      <directory>resources</directory>
+      <outputDirectory>/resources</outputDirectory>
+      <includes>
+        <include>**/*.sh</include>
+        <include>**/*.sql</include>
+      </includes>
+    </fileSet>
+    <fileSet>
+      <directory>assembly</directory>
+      <outputDirectory>/assembly</outputDirectory>
+      <includes>
+        <include>*.sh</include>
+        <include>dockerfile</include>
+        <include>requirements</include>
+      </includes>
+    </fileSet>
+    <fileSet>
+      <directory>.</directory>
+      <outputDirectory>/</outputDirectory>
+      <includes>
+        <include>*.py</include>
+        <include>*.txt</include>
+        <include>*.sh</include>
+        <include>*.ini</include>
+        <include>*.md</include>
+      </includes>
+    </fileSet>
+  </fileSets>
+  <baseDirectory>usecase-ui/nlp</baseDirectory>
+</assembly>
diff --git a/initialize.sh b/initialize.sh
new file mode 100644
index 0000000..ba50ed0
--- /dev/null
+++ b/initialize.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Copyright 2017 CTC, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+pip install -r requirements.txt
diff --git a/mvn-phase-script.sh b/mvn-phase-script.sh
new file mode 100644
index 0000000..be7f566
--- /dev/null
+++ b/mvn-phase-script.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright 2018 CTC, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+set -e
+
+echo "running script: [$0] for module [$1] at stage [$2]"
+
+export SETTINGS_FILE=${SETTINGS_FILE:-$HOME/.m2/settings.xml}
+MVN_PROJECT_MODULEID="$1"
+MVN_PHASE="$2"
+
+
+FQDN="${MVN_PROJECT_GROUPID}.${MVN_PROJECT_ARTIFACTID}"
+if [ "$MVN_PROJECT_MODULEID" == "__" ]; then
+  MVN_PROJECT_MODULEID=""
+fi
+
+if [ -z "$WORKSPACE" ]; then
+  WORKSPACE=$(pwd)
+fi
+
+
+# mvn phase in life cycle
+MVN_PHASE="$2"
+
+
+echo "MVN_PROJECT_MODULEID is [$MVN_PROJECT_MODULEID]"
+echo "MVN_PHASE is [$MVN_PHASE]"
+echo "MVN_PROJECT_GROUPID is [$MVN_PROJECT_GROUPID]"
+echo "MVN_PROJECT_ARTIFACTID is [$MVN_PROJECT_ARTIFACTID]"
+echo "MVN_PROJECT_VERSION is [$MVN_PROJECT_VERSION]"
+
+run_tox_test()
+{
+  set -x
+  CURDIR=$(pwd)
+  if [[ ${CURDIR} =~ "-sonar" ]]
+  then
+    echo "====Sonar job, need execute tox."
+    TOXINIS=$(find . -name "tox.ini")
+    for TOXINI in "${TOXINIS[@]}"; do
+      DIR=$(echo "$TOXINI" | rev | cut -f3- -d'/' | rev)
+      cd "${CURDIR}/${DIR}"
+      rm -rf ./venv-tox ./.tox
+      virtualenv ./venv-tox
+      source ./venv-tox/bin/activate
+      pip install --upgrade pip
+      pip install --upgrade tox argparse
+      pip freeze
+      tox
+      deactivate
+      rm -rf ./venv-tox ./.tox
+    done
+  else
+    echo "====Not a sonar job, need not execute tox."
+  fi
+}
+
+
+case $MVN_PHASE in
+clean)
+  echo "==> clean phase script"
+  rm -rf ./venv-*
+  ;;
+test)
+  echo "==> test phase script"
+  run_tox_test
+  ;;
+*)
+  echo "==> unprocessed phase"
+  ;;
+esac
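+
+# Local usage sketch (an assumption, not part of the CI setup: Jenkins
+# normally supplies the module id and phase; "__" means the root module):
+#   ./mvn-phase-script.sh __ test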
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..6bcb45e
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <groupId>org.onap.oparent</groupId>
+        <artifactId>oparent</artifactId>
+        <version>2.1.0</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>org.onap.usecase-ui</groupId>
+    <artifactId>nlp</artifactId>
+    <version>1.0.10-SNAPSHOT</version>
+    <packaging>pom</packaging>
+    <name>nlp</name>
+    <description>usecase-ui nlp</description>
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <sonar.sources>.</sonar.sources>
+        <sonar.junit.reportsPath>xunit-results.xml</sonar.junit.reportsPath>
+        <sonar.python.coverage.reportPath>coverage.xml</sonar.python.coverage.reportPath>
+        <sonar.language>py</sonar.language>
+        <sonar.pluginname>python</sonar.pluginname>
+        <sonar.inclusions>**/**.py</sonar.inclusions>
+        <sonar.exclusions>**/tests/**.py,**/test*.py</sonar.exclusions>
+    </properties>
+    <build>
+        <pluginManagement>
+            <plugins>
+                <plugin>
+                    <groupId>org.codehaus.mojo</groupId>
+                    <artifactId>exec-maven-plugin</artifactId>
+                    <version>1.2.1</version>
+                    <configuration>
+                        <executable>${project.basedir}/mvn-phase-script.sh</executable>
+                        <environmentVariables>
+                            <MVN_PROJECT_GROUPID>${project.groupId}</MVN_PROJECT_GROUPID>
+                            <MVN_PROJECT_ARTIFACTID>${project.artifactId}</MVN_PROJECT_ARTIFACTID>
+                            <MVN_PROJECT_VERSION>${project.version}</MVN_PROJECT_VERSION>
+                        </environmentVariables>
+                    </configuration>
+                </plugin>
+            </plugins>
+        </pluginManagement>
+        <plugins>
+            <plugin>
+                <groupId>org.codehaus.mojo</groupId>
+                <artifactId>exec-maven-plugin</artifactId>
+                <version>1.2.1</version>
+                <executions>
+                    <execution>
+                        <id>clean phase script</id>
+                        <phase>clean</phase>
+                        <goals>
+                            <goal>exec</goal>
+                        </goals>
+                        <configuration>
+                            <arguments>
+                                <argument>__</argument>
+                                <argument>clean</argument>
+                            </arguments>
+                        </configuration>
+                    </execution>
+                    <execution>
+                        <id>test script</id>
+                        <phase>test</phase>
+                        <goals>
+                            <goal>exec</goal>
+                        </goals>
+                        <configuration>
+                            <arguments>
+                                <argument>__</argument>
+                                <argument>test</argument>
+                            </arguments>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <configuration>
+                    <appendAssemblyId>false</appendAssemblyId>
+                    <descriptors>
+                        <descriptor>assembly.xml</descriptor>
+                    </descriptors>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>make-assembly</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/releases/1.0.0-container.yaml b/releases/1.0.0-container.yaml
new file mode 100644
index 0000000..002cb73
--- /dev/null
+++ b/releases/1.0.0-container.yaml
@@ -0,0 +1,10 @@
+distribution_type: 'container'
+container_release_tag: '1.0.0'
+project: 'uui-nlp'
+container_pull_registry: nexus3.onap.org:10003
+container_push_registry: nexus3.onap.org:10002
+log_dir: 'uui-nlp-master-docker-java-version-shell-daily/206/'
+ref: '5ff788c006f00be38b80edf80945f052ffe4a62b'
+containers:
+    - name: 'uui/nlp'
+      version: '1.0.0-STAGING-20200320T085153'
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..024d1b4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+pytest
+entrypoints==0.3
+Flask==1.1.1
+coverage
diff --git a/scripts/api_squad.py b/scripts/api_squad.py
index 239bbd6..f29a74b 100644
--- a/scripts/api_squad.py
+++ b/scripts/api_squad.py
@@ -1,6 +1,6 @@
 # coding=utf-8
 # squad interface
-# Required parameters:
+# Required parameters
 # FLAGS_output_dir : the output path of the model training during training process, the output of the trained model, etc.; the output path of the model prediction during predicting process
 # FLAGS_init_checkpoint_squad : model initialization path, use bert pre-trained model for training; use the output path during training for prediction
 # FLAGS_predict_file : the file to be predicted, csv file
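+# A typical call into main() (defined in api_squad_offline.py) might look
+# like this -- illustrative values only; the paths and sizes below are
+# assumptions, not defaults:
+#   main(FLAGS_output_dir='./output',
+#        FLAGS_init_checkpoint_squad='/home/run/uncased_L-12_H-768_A-12/bert_model.ckpt',
+#        FLAGS_export_dir='./export', FLAGS_train_file='./train.xlsx',
+#        FLAGS_do_train=True, FLAGS_train_batch_size=16, FLAGS_num_train_epochs=3.0)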
@@ -22,18 +22,13 @@ from __future__ import print_function
 import collections
 import json
 import math
-import os
-import random
 import modeling
 import optimization
 import tokenization
 import six
 import tensorflow as tf
 import pandas as pd
-from global_setting import FLAGS_bert_config_file, FLAGS_vocab_file, FLAGS_init_checkpoint_squad
-
-
-
+from global_setting import FLAGS_init_checkpoint_squad
 
 FLAGS_max_seq_length = 512
 FLAGS_do_lower_case = True
@@ -53,11 +48,12 @@ FLAGS_warmup_proportion = 0.1
 FLAGS_gcp_project = None
 FLAGS_null_score_diff_threshold = 0.0
 
-def make_json(input_file,questions):
+
+def make_json(input_file, questions):
     print(input_file)
     data_train = pd.read_excel(input_file)
     print(444)
-    data_train.fillna(0,inplace=True)
+    data_train.fillna(0, inplace=True)
     data_train.index = [i for i in range(len(data_train))]
     question = questions
     res = {}
@@ -67,17 +63,16 @@ def make_json(input_file,questions):
         data_inside['title'] = 'Not available'
         data_inside['paragraphs'] = []
         paragraphs_inside = {}
-        paragraphs_inside['context'] = data_train.loc[i,'text']
+        paragraphs_inside['context'] = data_train.loc[i, 'text']
         paragraphs_inside['qas'] = []
-        for ques in question: 
+        for ques in question:
             qas_inside = {}
             qas_inside['answers'] = []
-            if data_train.loc[i,ques]:
+            if data_train.loc[i, ques]:
                 answer_inside = {}
-                answer_inside['text'] = str(data_train.loc[i,ques])
+                answer_inside['text'] = str(data_train.loc[i, ques])
                 answer_inside['answer_start'] = paragraphs_inside['context'].find(answer_inside['text'])
                 qas_inside['is_impossible'] = 0
             else:
                 qas_inside['is_impossible'] = 1
                 answer_inside = {}
@@ -92,8 +87,6 @@ def make_json(input_file,questions):
     return json.dumps(res)
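+
+# Shape of the JSON produced above -- an illustrative sketch mirroring the
+# SQuAD v2.0 layout (the concrete values here are invented examples):
+#   {"data": [{"title": "Not available",
+#              "paragraphs": [{"context": "<row text>",
+#                              "qas": [{"answers": [...], "is_impossible": 0}]}]}]}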
 
 
-
-
 class SquadExample(object):
     """A single training/test example for simple sequence classification.
 
     For examples without an answer, the start and end position are -1.
     """
@@ -164,9 +157,9 @@ class InputFeatures(object):
         self.is_impossible = is_impossible
 
 
-def read_squad_examples(input_file, is_training,questions,FLAGS_version_2_with_negative):
+def read_squad_examples(input_file, is_training, questions, FLAGS_version_2_with_negative):
     """Read a SQuAD json file into a list of SquadExample."""
-    data = make_json(input_file,questions)
+    data = make_json(input_file, questions)
     input_data = json.loads(data)["data"]
 
     def is_whitespace(c):
@@ -212,8 +205,7 @@ def read_squad_examples(input_file, is_training,questions,FLAGS_version_2_with_n
                 answer_offset = answer["answer_start"]
                 answer_length = len(orig_answer_text)
                 start_position = char_to_word_offset[answer_offset]
-                end_position = char_to_word_offset[answer_offset + answer_length -
-                                                   1]
+                end_position = char_to_word_offset[answer_offset + answer_length - 1]
                 # Only add answers where the text can be exactly recovered from the
                 # document. If this CAN'T happen it's likely due to weird Unicode
                 # stuff so we will just skip the example.
@@ -353,8 +345,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
             doc_start = doc_span.start
             doc_end = doc_span.start + doc_span.length - 1
             out_of_span = False
-            if not (tok_start_position >= doc_start and
-                    tok_end_position <= doc_end):
+            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                 out_of_span = True
             if out_of_span:
                 start_position = 0
@@ -544,7 +535,6 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
             tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
 
-
         unique_ids = features["unique_ids"]
         input_ids = features["input_ids"]
         input_mask = features["input_mask"]
         segment_ids = features["segment_ids"]
@@ -686,7 +676,7 @@ RawResult = collections.namedtuple("RawResult",
 
 def write_predictions(all_examples, all_features, all_results, n_best_size,
                       max_answer_length, do_lower_case, output_prediction_file,
-                      output_nbest_file, output_null_log_odds_file):
+                      output_nbest_file, output_null_log_odds_file, FLAGS_version_2_with_negative):
     """Write final predictions to the json file and log-odds of null if needed."""
     tf.logging.info("Writing predictions to: %s" % (output_prediction_file))
     tf.logging.info("Writing nbest to: %s" % (output_nbest_file))
@@ -705,7 +695,6 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
 
     all_predictions = collections.OrderedDict()
     all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
 
     for (example_index, example) in enumerate(all_examples):
         features = example_index_to_features[example_index]
@@ -713,9 +702,6 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
         prelim_predictions = []
         # keep track of the minimum score of null start+end of position 0
         score_null = 1000000  # large and positive
-        min_null_feature_index = 0  # the paragraph slice with min null score
-        null_start_logit = 0  # the start logit at the slice with min null score
-        null_end_logit = 0  # the end logit at the slice with min null score
         for (feature_index, feature) in enumerate(features):
             result = unique_id_to_result[feature.unique_id]
             start_indexes = _get_best_indexes(result.start_logits, n_best_size)
@@ -726,9 +712,6 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                     result.end_logits[0]
                 if feature_null_score < score_null:
                     score_null = feature_null_score
-                    min_null_feature_index = feature_index
-                    null_start_logit = result.start_logits[0]
-                    null_end_logit = result.end_logits[0]
             for start_index in start_indexes:
                 for end_index in end_indexes:
                     # We could hypothetically create invalid predictions, e.g., predict
diff --git a/scripts/api_squad_offline.py b/scripts/api_squad_offline.py
index 1c98a10..8a05141 100644
--- a/scripts/api_squad_offline.py
+++ b/scripts/api_squad_offline.py
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env python
 # coding: utf-8
 
@@ -9,25 +8,36 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import json
-import datetime
-import threading
-import time
 from flask import Flask, abort, request, jsonify
 from concurrent.futures import ThreadPoolExecutor
-import collections
-import math
 import os
 import random
 import modeling
-import optimization
 import tokenization
-import six
 import tensorflow as tf
 import sys
-from api_squad import *
-from global_setting import *
+
+from api_squad import FLAGS_max_seq_length
+from api_squad import FLAGS_do_lower_case
+from api_squad import FLAGS_use_tpu
+from api_squad import FLAGS_tpu_name
+from api_squad import FLAGS_tpu_zone
+from api_squad import FLAGS_gcp_project
+from api_squad import FLAGS_master
+from api_squad import FLAGS_save_checkpoints_steps
+from api_squad import FLAGS_iterations_per_loop
+from api_squad import FLAGS_num_tpu_cores
+from api_squad import FLAGS_warmup_proportion
+from api_squad import FLAGS_doc_stride
+from api_squad import model_fn_builder
+from api_squad import FeatureWriter
+from api_squad import convert_examples_to_features
+from api_squad import input_fn_builder
+
+from global_setting import CUDA_VISIBLE_DEVICES
+from global_setting import validate_flags_or_throw
+from global_setting import read_squad_examples
 from global_setting import FLAGS_bert_config_file, FLAGS_vocab_file, FLAGS_init_checkpoint_squad, questions
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
@@ -35,9 +45,10 @@ os.environ["CUDA_VISIBLE_DEVICES"] = str(CUDA_VISIBLE_DEVICES)
 
 app = Flask(__name__)
 
+
 def serving_input_fn():
     input_ids = tf.placeholder(tf.int32, [None, FLAGS_max_seq_length], name='input_ids')
-    unique_id = tf.placeholder(tf.int32,[None])
+    unique_id = tf.placeholder(tf.int32, [None])
     input_mask = tf.placeholder(tf.int32, [None, FLAGS_max_seq_length], name='input_mask')
     segment_ids = tf.placeholder(tf.int32, [None, FLAGS_max_seq_length], name='segment_ids')
     input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
@@ -45,13 +56,13 @@ def serving_input_fn():
         'input_mask': input_mask,
         'segment_ids': segment_ids,
         'unique_ids': unique_id,
-        })()
+    })()
     return input_fn
 
+
 def main(FLAGS_output_dir, FLAGS_init_checkpoint_squad, FLAGS_export_dir, FLAGS_predict_file=None, FLAGS_train_file=None, FLAGS_do_predict=False, FLAGS_do_train=False, FLAGS_train_batch_size=16, FLAGS_predict_batch_size=8, FLAGS_learning_rate=5e-5, FLAGS_num_train_epochs=3.0, FLAGS_max_answer_length=100, FLAGS_max_query_length=64, FLAGS_version_2_with_negative=False):
-
     tf.logging.set_verbosity(tf.logging.INFO)
 
     bert_config = modeling.BertConfig.from_json_file(FLAGS_bert_config_file)
@@ -60,7 +71,6 @@ def main(FLAGS_output_dir, FLAGS_init_checkpoint_squad, FLAGS_export_dir, FLAGS_
 
     tf.gfile.MakeDirs(FLAGS_output_dir)
 
-
     tokenizer = tokenization.FullTokenizer(
         vocab_file=FLAGS_vocab_file, do_lower_case=FLAGS_do_lower_case)
 
@@ -68,7 +78,6 @@ def main(FLAGS_output_dir, FLAGS_init_checkpoint_squad, FLAGS_export_dir, FLAGS_
     if FLAGS_use_tpu and FLAGS_tpu_name:
         tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
             FLAGS_tpu_name, zone=FLAGS_tpu_zone, project=FLAGS_gcp_project)
-
     is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
     run_config = tf.contrib.tpu.RunConfig(
         cluster=tpu_cluster_resolver,
@@ -86,8 +95,7 @@ def main(FLAGS_output_dir, FLAGS_init_checkpoint_squad, FLAGS_export_dir, FLAGS_
 
     if FLAGS_do_train:
         train_examples = read_squad_examples(
-            input_file=FLAGS_train_file, is_training=True,questions = questions,FLAGS_version_2_with_negative = FLAGS_version_2_with_negative)
-
+            input_file=FLAGS_train_file, is_training=True, questions=questions, FLAGS_version_2_with_negative=FLAGS_version_2_with_negative)
         num_train_steps = int(
             len(train_examples) / FLAGS_train_batch_size * FLAGS_num_train_epochs)
         num_warmup_steps = int(num_train_steps * FLAGS_warmup_proportion)
@@ -174,7 +182,7 @@ class AI2Flask:
 
         @app.route('/api/offline/train', methods=['POST'])
         def text_analyse():
-            if not request.json or not 'task_id' in request.json:
+            if not request.json or 'task_id' not in request.json:
                 abort(400)
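+            # Example request (an illustrative sketch; fields other than
+            # task_id are not shown in this hunk and are therefore elided):
+            #   curl -X POST http://<host>:<port>/api/offline/train \
+            #        -H "Content-Type: application/json" \
+            #        -d '{"task_id": "1", ...}'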
             if check_threads():
                 return jsonify({"Des": "Task list is full. Can not submit new task! ", "Result": "Failed to submit the training task ", "Status": "ERROR"})
@@ -227,8 +235,6 @@ class AI2Flask:
             except Exception as e:
                 return jsonify({"Des": str(e), "Result": 'None', "Status": "Error"})
 
-
-
         @app.route('/api/offline/status', methods=['POST'])
         def todo_status():
             task_id = request.json['task_id']
diff --git a/scripts/api_squad_online.py b/scripts/api_squad_online.py
index 9cc6b08..abe3d5f 100644
--- a/scripts/api_squad_online.py
+++ b/scripts/api_squad_online.py
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env python
 # coding: utf-8
 
@@ -6,30 +5,15 @@
 # date = 20201204
 
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import json
-import datetime
-import threading
 import sys
 from flask import Flask, abort, request, jsonify
-from concurrent.futures import ThreadPoolExecutor
-import collections
-import math
 import os
-import random
-import modeling
-import optimization
-import tokenization
-import six
-import tensorflow as tf
-import pandas as pd
-import numpy as np
-import requests
-from global_setting import *
+
+from global_setting import questions, tokenizer_ch, CUDA_VISIBLE_DEVICES
 from create_squad_features import get_squad_feature_result
 
@@ -38,17 +22,15 @@ app = Flask(__name__)
 
 class AI2Flask:
 
-    def __init__(self, port=5000,workers=4):
+    def __init__(self, port=5000, workers=4):
         self.app = app
         self.port = port
 
-
         @app.route('/api/online/predict', methods=['POST'])
         def text_analyse():
             if not request.json:
                 abort(400)
-
+            else:
                 try:
                     try:
                         title = request.json['title']
                     except:
                         title = 'Not available'
                     text_origin = request.json['text']
-
                     if len(text_origin) > 800:
                         text = text_origin[:800]
                     else:
                         text = text_origin
 
                     result = {}
                     for ques in questions:
-                        tmp = get_squad_feature_result(title=title,text=text,tokenizer=tokenizer_ch,question=[ques],url='http://localhost:8502/v1/models/predict:predict')
+                        tmp = get_squad_feature_result(title=title, text=text, tokenizer=tokenizer_ch, question=[ques], url='http://localhost:8502/v1/models/predict:predict')
                         result[ques] = dict(tmp)[ques]
-
-
+
                     print('finished!!')
                     return json.dumps(result)
-
-
+
                 except KeyError as e:
                     return jsonify({"Des": 'KeyError: {}'.format(str(e)), "Result": 'None', "Status": "Error"})
                 except Exception as e:
                     return jsonify({"Des": str(e), "Result": 'None', "Status": "Error"})
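+
+        # Example request (an illustrative sketch; the port is an assumption,
+        # since the service reads its listen port from argv[1]):
+        #   curl -X POST http://localhost:33011/api/online/predict \
+        #        -H "Content-Type: application/json" \
+        #        -d '{"title": "5G slice order", "text": "Create a slice for 10 UEs with 20ms latency"}'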
-
-
-
-
-
         @app.route('/api/online/load', methods=['POST'])
         def load_model():
@@ -105,4 +79,3 @@ class AI2Flask:
 if __name__ == '__main__':
     port = sys.argv[1]
     AI2Flask(port=port).start()
-
diff --git a/scripts/create_squad_features.py b/scripts/create_squad_features.py
index e779b9e..ce274e0 100644
--- a/scripts/create_squad_features.py
+++ b/scripts/create_squad_features.py
@@ -1,38 +1,28 @@
-
-    #!/usr/bin/env python
-    # coding: utf-8
-
-    # auther = 'liuzhiyong'
-    # date = 20201204
+#!/usr/bin/env python
+# coding: utf-8
+
+# author = 'liuzhiyong'
+# date = 20201204
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import json
-import datetime
-import threading
-import time
-from flask import Flask, abort, request, jsonify
-from concurrent.futures import ThreadPoolExecutor
 import collections
 import math
-import os
-import random
-import modeling
-import optimization
 import tokenization
 import six
 import tensorflow as tf
-import sys
 import requests
-from global_setting import *
+
+from global_setting import _improve_answer_span
 
 version_2_with_negative = True
 
-def get_squad_feature_result(title,text,tokenizer,question, url):
+
+def get_squad_feature_result(title, text, tokenizer, question, url):
 
     def make_json(title, text, question):
         res = {}
@@ -59,7 +49,6 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
             res['data'].append(data_inside.copy())
         return json.dumps(res)
 
-
     def _compute_softmax(scores):
         """Compute softmax probability over raw logits."""
         if not scores:
@@ -83,7 +72,7 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
         return probs
 
     def get_final_text(pred_text, orig_text, do_lower_case):
-        
+
         def _strip_spaces(text):
             ns_chars = []
             ns_to_s_map = collections.OrderedDict()
@@ -152,7 +141,7 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
         return output_text
 
     def _get_best_indexes(logits, n_best_size):
-        
+
         index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
 
         best_indexes = []
@@ -164,8 +153,7 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
     RawResult = collections.namedtuple("RawResult",
                                        ["unique_id", "start_logits", "end_logits"])
 
-    def write_predictions(all_examples, all_features, all_results, n_best_size,
-                          max_answer_length, do_lower_case):
+    def write_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case):
         """Write final predictions to the json file and log-odds of null if needed."""
 
         example_index_to_features = collections.defaultdict(list)
@@ -236,19 +224,19 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
 
             if version_2_with_negative:
                 prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=min_null_feature_index,
-                            start_index=0,
-                            end_index=0,
-                            start_logit=null_start_logit,
-                            end_logit=null_end_logit))
+                    _PrelimPrediction(
+                        feature_index=min_null_feature_index,
+                        start_index=0,
+                        end_index=0,
+                        start_logit=null_start_logit,
+                        end_logit=null_end_logit))
             prelim_predictions = sorted(
-                    prelim_predictions,
-                    key=lambda x: (x.start_logit + x.end_logit),
-                    reverse=True)
+                prelim_predictions,
+                key=lambda x: (x.start_logit + x.end_logit),
+                reverse=True)
 
             _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-                    "NbestPrediction", ["text", "start_logit", "end_logit"])
+                "NbestPrediction", ["text", "start_logit", "end_logit"])
 
             seen_predictions = {}
             nbest = []
@@ -282,10 +270,10 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
                 seen_predictions[final_text] = True
 
                 nbest.append(
-                        _NbestPrediction(
-                            text=final_text,
-                            start_logit=pred.start_logit,
-                            end_logit=pred.end_logit))
+                    _NbestPrediction(
+                        text=final_text,
+                        start_logit=pred.start_logit,
+                        end_logit=pred.end_logit))
 
             # if we didn't include the empty option in the n-best, include it
             if version_2_with_negative:
@@ -299,7 +287,7 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
             # just create a nonce prediction in this case to avoid failure.
             if not nbest:
                 nbest.append(
-                        _NbestPrediction(text="", start_logit=0.0, end_logit=0.0))
+                    _NbestPrediction(text="", start_logit=0.0, end_logit=0.0))
 
             assert len(nbest) >= 1
@@ -339,30 +327,28 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
             all_nbest_json[example.qas_id] = nbest_json
 
         return all_predictions
 
-
     def create_int_feature(values):
         feature = tf.train.Feature(
             int64_list=tf.train.Int64List(value=list(values)))
         return feature
 
-
     class InputFeatures(object):
         """A single set of features of data."""
 
         def __init__(self,
-                    unique_id,
-                    example_index,
-                    doc_span_index,
-                    tokens,
-                    token_to_orig_map,
-                    token_is_max_context,
-                    input_ids,
-                    input_mask,
-                    segment_ids,
-                    start_position=None,
-                    end_position=None,
-                    is_impossible=None):
+                     unique_id,
+                     example_index,
+                     doc_span_index,
+                     tokens,
+                     token_to_orig_map,
+                     token_is_max_context,
+                     input_ids,
+                     input_mask,
+                     segment_ids,
+                     start_position=None,
+                     end_position=None,
+                     is_impossible=None):
             self.unique_id = unique_id
             self.example_index = example_index
             self.doc_span_index = doc_span_index
@@ -413,7 +399,7 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
         return cur_span_index == best_span_index
 
     def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                    doc_stride, max_query_length, is_training):
+                                     doc_stride, max_query_length, is_training):
         """Loads a data file into a list of `InputBatch`s."""
 
         unique_id = 1000000000
@@ -487,7 +473,7 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
                     token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
 
                     is_max_context = _check_is_max_context(doc_spans, doc_span_index,
-                                                        split_token_index)
+                                                           split_token_index)
                     token_is_max_context[len(tokens)] = is_max_context
                     tokens.append(all_doc_tokens[split_token_index])
                     segment_ids.append(1)
@@ -518,8 +504,7 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
                     doc_start = doc_span.start
                     doc_end = doc_span.start + doc_span.length - 1
                     out_of_span = False
-                    if not (tok_start_position >= doc_start and
-                            tok_end_position <= doc_end):
+                    if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                         out_of_span = True
                     if out_of_span:
                         start_position = 0
@@ -574,22 +559,21 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
                     is_impossible=example.is_impossible)
 
                 # Run callback
-                
+
                 result.append(feature)
 
                 unique_id += 1
 
         return result
 
     class SquadExample(object):
-
         def __init__(self,
-                    qas_id,
-                    question_text,
-                    doc_tokens,
-                    orig_answer_text=None,
-                    start_position=None,
-                    end_position=None,
-                    is_impossible=False):
+                     qas_id,
+                     question_text,
+                     doc_tokens,
+                     orig_answer_text=None,
+                     start_position=None,
+                     end_position=None,
+                     is_impossible=False):
             self.qas_id = qas_id
             self.question_text = question_text
             self.doc_tokens = doc_tokens
@@ -615,8 +599,6 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
             s += ", is_impossible: %r" % (self.is_impossible)
             return s
 
-
-
     def read_squad_examples(input_file, is_training):
         """Read a SQuAD json file into a list of SquadExample."""
@@ -654,7 +636,6 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
                     is_impossible = False
                     if is_training:
-
                         if (len(qa["answers"]) != 1) and (not is_impossible):
                             raise ValueError(
                                 "For training, each question should have exactly 1 answer.")
@@ -664,8 +645,7 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
                             answer_offset = answer["answer_start"]
                             answer_length = len(orig_answer_text)
                             start_position = char_to_word_offset[answer_offset]
-                            end_position = char_to_word_offset[answer_offset + answer_length -
-                                                               1]
+                            end_position = char_to_word_offset[answer_offset + answer_length - 1]
                             # Only add answers where the text can be exactly recovered from the
                             # document. If this CAN'T happen it's likely due to weird Unicode
                             # stuff so we will just skip the example.
@@ -678,7 +658,7 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
                                 tokenization.whitespace_tokenize(orig_answer_text))
                             if actual_text.find(cleaned_answer_text) == -1:
                                 tf.logging.warning("Could not find answer: '%s' vs. '%s'",
-                                                actual_text, cleaned_answer_text)
+                                                   actual_text, cleaned_answer_text)
                                 continue
                         else:
                             start_position = -1
@@ -697,27 +677,24 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
 
         return examples
 
-    def get_result(title,text,question,url):
-
-        data = make_json(title,text,question)
-
-
-        examples = read_squad_examples(data,False)
+    def get_result(title, text, question, url):
+        data = make_json(title, text, question)
+        examples = read_squad_examples(data, False)
 
         predict_files = convert_examples_to_features(
-                examples=examples,
-                tokenizer=tokenizer,
-                max_seq_length=512,
-                doc_stride=128,
-                max_query_length=100,
-                is_training=False,
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=512,
+            doc_stride=128,
+            max_query_length=100,
+            is_training=False,
         )
-        
+
         headers = {"content-type": "application/json"}
         all_results = []
-        for predict_file in predict_files: 
+        for predict_file in predict_files:
             features = {}
             features["unique_ids"] = predict_file.unique_id
             features["input_mask"] = predict_file.input_mask
@@ -725,22 +702,20 @@ def get_squad_feature_result(title,text,tokenizer,question, url):
             features["input_ids"] = predict_file.input_ids
             data_list = []
             data_list.append(features)
-            
+
             data = json.dumps({"instances": data_list})
-            
+
             json_response = requests.post(url, data=data, headers=headers)
-
             x = json.loads(json_response.text)
-            
+
             all_results.append(
                 RawResult(
                     unique_id=predict_file.unique_id,
                     start_logits=x['predictions'][0]['start_logits'],
                     end_logits=x['predictions'][0]['end_logits']))
-            
-        result = write_predictions(examples, predict_files, all_results,20, 64,True)
+
+        result = write_predictions(examples, predict_files, all_results, 20, 64, True)
 
         return result
 
     return get_result(title, text, question, url)
-
diff --git a/scripts/global_setting.py b/scripts/global_setting.py
index bb035f9..51dfec1 100644
--- a/scripts/global_setting.py
+++ b/scripts/global_setting.py
@@ -3,34 +3,29 @@ from __future__ import division
 from __future__ import print_function
 
-import collections
-import math
-import modeling
-import optimization
+# import collections
+# import math
+# import modeling
+# import optimization
 import tokenization
-import six
-import tensorflow as tf
-import os
+# import six
+# import tensorflow as tf
+# import os
 
-
-
-
-### Global variables
+# Global variables
 
 # GPU number, default: -1, means not used
-CUDA_VISIBLE_DEVICES="2"
+CUDA_VISIBLE_DEVICES = "2"
 # Questions to be trained/predicted
-questions = ['Communication Service Name','Max Number of UEs','Data Rate Downlink','Latency','Data Rate Uplink','Resource Sharing Level','Mobility','Area']
+questions = ['Communication Service Name', 'Max Number of UEs', 'Data Rate Downlink', 'Latency', 'Data Rate Uplink', 'Resource Sharing Level', 'Mobility', 'Area']
 
 # Configuration file
-FLAGS_bert_config_file = '/home/run/chinese_L-12_H-768_A-12/bert_config.json'
-FLAGS_vocab_file = '/home/run/chinese_L-12_H-768_A-12/vocab.txt'
-FLAGS_init_checkpoint_squad = '/home/run/chinese_L-12_H-768_A-12/bert_model.ckpt'
+FLAGS_bert_config_file = '/home/run/uncased_L-12_H-768_A-12/bert_config.json'
+FLAGS_vocab_file = '/home/run/uncased_L-12_H-768_A-12/vocab.txt'
+FLAGS_init_checkpoint_squad = '/home/run/uncased_L-12_H-768_A-12/bert_model.ckpt'
 
 max_seq_length = 512
 
 tokenizer_ch = tokenization.FullTokenizer(vocab_file=FLAGS_vocab_file, do_lower_case=True)
-
-
diff --git a/test_1.py b/test_1.py
new file mode 100644
index 0000000..3e50176
--- /dev/null
+++ b/test_1.py
@@ -0,0 +1,24 @@
+# -*- coding:utf-8 -*-
+import pytest
+
+
+@pytest.fixture(scope='function')
+def setup_function(request):
+    def teardown_function():
+        print("teardown_function called.")
+    request.addfinalizer(teardown_function)  # this nested function does the teardown work
+    print('setup_function called.')
+
+
+@pytest.fixture(scope='module')
+def setup_module(request):
+    def teardown_module():
+        print("teardown_module called.")
+    request.addfinalizer(teardown_module)
+    print('setup_module called.')
+
+
+# @pytest.mark.website
+def test_1(setup_function):
+    print('Test_1 called.')
+    assert 1 == 1
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..2b6716e
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,26 @@
+# content of: tox.ini , put in same dir as setup.py
+[tox]
+envlist = py36,pep8,cov
+skipsdist = true
+
+[flake8]
+ignore = E501,E722
+exclude = ./venv-tox,./.tox,./venv,./docs
+
+[testenv:pep8]
+deps = flake8
+commands = flake8
+
+[testenv]
+deps = -r{toxinidir}/requirements.txt
+commands = pytest
+
+[testenv:py36]
+commands =
+    {[testenv]commands}
+
+[testenv:cov]
+deps = pytest
+       pytest-cov
+commands = pytest --cov-report=html
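+
+# Local usage sketch (assumes tox is installed): run `tox` for the full
+# envlist above, or a single environment with `tox -e pep8`.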
diff --git a/version.properties b/version.properties
new file mode 100644
index 0000000..6c74c88
--- /dev/null
+++ b/version.properties
@@ -0,0 +1,27 @@
+# Copyright (C) 2017 CTC, Inc. and others. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Versioning variables
+# Note that these variables cannot be structured (e.g. : version.release or version.snapshot etc...)
+# because they are used in Jenkins, whose plug-in doesn't support them
+
+major=1
+minor=0
+patch=0
+
+base_version=${major}.${minor}.${patch}
+
+# Release must be completed with git revision # in Jenkins
+release_version=${base_version}
+snapshot_version=${base_version}-SNAPSHOT
-- 
cgit 1.2.3-korg