-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlaunch.py
215 lines (181 loc) · 7.24 KB
/
launch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import os
import sys
import json
import glob
import pathlib
import argparse
import bs4
import datetime
import time
import random
from collections import defaultdict
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from xml.etree import ElementTree
from pyknp import Juman
from preprocess.metadata import extract_meta
from preprocess.extracttext import extract_meta_add_text
from annotation_cleanup.classifier import extract_meta_add_keyword_classify
from classifiers.bertsimple import extract_meta_add_bert_info, init_bert, init_classifier
# ---------------------------------------------------------------------------
# Run-time settings
# ---------------------------------------------------------------------------
gpu_num = '0'   # CUDA device index; interpolated into the "cuda:{}" device string
batch = 32      # batch size passed to extract_meta_add_bert_info

# ---------------------------------------------------------------------------
# Shared data locations
# ---------------------------------------------------------------------------
data_dir = '/mnt/hinoki/share/covid19'
input_dir = '/mnt/hinoki/share/covid19/run/new-translated-files'
blacklist = '{}/url_black_list.txt'.format(data_dir)
sourceinfo_file = '{}/source_info_en.tsv'.format(data_dir)
output = '{}/output.jsonl'.format(data_dir)

# ---------------------------------------------------------------------------
# Model and classifier resources
# ---------------------------------------------------------------------------
classes_file = './classifiers/classes.txt'
bert_dir = './model'
bert_model = '{}/Japanese_L-12_H-768_A-12_E-30_BPE_transformers'.format(bert_dir) # dir for the bert model itself
bertsimple_model = '{}/classifier.pth'.format(bert_dir) # model of the classifier, bert+linear layers
keyword_file = './classifiers/keywords.txt'

# ---------------------------------------------------------------------------
# Logs written by the upstream translation / XML-conversion steps
# ---------------------------------------------------------------------------
translated_log_dir = '/mnt/hinoki/share/covid19/run/trans_log_song'
ja_translated_list_file = '{}/trans_list.txt'.format(translated_log_dir)
en_translated_list_file = '{}/en_trans_list.txt'.format(translated_log_dir)
xml_log_dir = '/mnt/hinoki/share/covid19/run/new-xml-files'

# ---------------------------------------------------------------------------
# Outputs of this script
# ---------------------------------------------------------------------------
# NOTE: output_dir used to be assigned twice; only this (final) value was ever
# read, so the earlier, overwritten assignment has been removed.
output_dir = '/mnt/hinoki/share/covid19/run/topic_classification_log'
output_jsonl = '{}/output.jsonl'.format(output_dir)   # one JSON document per classified file
output_txt = '{}/topic_classified_list.txt'.format(output_dir)   # "<path> <status>" per attempted file
#output_errored_txt = '{}/topic_classified_list_errored.txt'.format(output_dir)
# Pipeline overview:
# 1. Collect the newly translated (Japanese) files that still need to be processed.
# 2. For each file, generate its metadata and the BERT topic-classification result.
def full_path_to_related_path(full_path):
    """Rewrite a path under the shared data directory as a '.'-rooted path.

    Every occurrence of '/mnt/hinoki/share/covid19' in *full_path* is
    replaced by '.'; all other text is returned unchanged.
    """
    shared_root = '/mnt/hinoki/share/covid19'
    # Splitting on the root and re-joining with '.' swaps every occurrence.
    return '.'.join(full_path.split(shared_root))
def write_to_access(input_path, status):
    """Append one "<path> <status>" record to the classification log.

    The record is written to the module-level ``output_txt`` file, opened in
    append mode. Leading/trailing whitespace of *input_path* is removed
    before it is written.
    """
    with open(output_txt, "a+") as access_log:
        access_log.write("{} {}\n".format(input_path.strip(), status))
def process_one_file(input_path):
    """Run the metadata + classification pipeline on one file and log the outcome.

    On success the resulting metadata is appended, as one JSON line, to
    ``output_jsonl``. In every case a "<path> <status>" record is appended to
    ``output_txt`` through ``write_to_access`` (status 0 = success,
    1 = failure), so the file is not picked up again on the next polling round.

    Args:
        input_path: path of the translated file to process.
    """
    try:
        related_path = full_path_to_related_path(input_path)
        meta = extract_meta(data_dir, sourceinfo_file, related_path)
        meta = extract_meta_add_text(data_dir, meta, juman)
        meta = extract_meta_add_keyword_classify(keyword_file, meta)
        meta = extract_meta_add_bert_info(meta, classes, bert, classifier, tokenizer, batch, device)
        # Serialise inside the try block: metadata that cannot be serialised is
        # a failure of this file and must be recorded as such, otherwise the
        # file would never be logged and would be retried on every round.
        output_res = json.dumps(meta, ensure_ascii=False)
    except Exception as err:
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate so the process can still be stopped.
        status = 1
        print ("classification error")
        print ("  {}: {!r}".format(input_path, err))
    else:
        status = 0
        with open(output_jsonl, "a+") as f:
            f.write(output_res.strip() + '\n')
    write_to_access(input_path, status)
def get_xml_converted_list(xml_log_dir):
    """Merge all XML-conversion log files into one list and one summary file.

    Every file matching ``new-xml-files*txt`` directly inside *xml_log_dir*
    is read; each of its lines is stripped, collected, and also written to
    ``<xml_log_dir>/xmled_files.txt`` (which is truncated on every call).

    Args:
        xml_log_dir: directory containing the per-run log files.

    Returns:
        List of the stripped lines of all readable log files, in the order
        the files are returned by ``glob``.
    """
    names = glob.glob("{}/new-xml-files*txt".format(xml_log_dir))
    all_xml_file = "{}/xmled_files.txt".format(xml_log_dir)
    xml_list = []
    with open(all_xml_file, "w") as of:
        for name in names:
            try:
                # Read the whole log before using any of it, so a log that
                # fails half-way contributes nothing (and the handle is closed).
                with open(name, "r") as log_file:
                    entries = [line.strip() for line in log_file]
            except (OSError, UnicodeError):
                # Best effort: an unreadable log is skipped, not fatal.
                continue
            xml_list.extend(entries)
            of.writelines(entry + '\n' for entry in entries)
    return xml_list
def get_feature(full_name):
    """Return an extension-independent key identifying a document by its URL path.

    The path is made relative with ``full_path_to_related_path``, its first
    four components are dropped, and the text after the last '.' of the file
    name is cut off (the dot itself is kept). The remaining components are
    joined with '/'. For example the components
    ``['2020', '11', 'index.html']`` give ``'2020/11/index.'``.

    Args:
        full_name: path of a translated / converted / classified file.

    Returns:
        The feature string, or '' when the path has fewer than five components.
    """
    related_name = full_path_to_related_path(full_name)
    parts = pathlib.Path(related_name.strip()).parts
    # Four leading components are skipped; at least one more (the file name)
    # is required.
    # NOTE(review): the skipped components look like
    # <root>/<country>/<kind>/<domain>, so the key does not include the domain
    # -- confirm that identical URL paths on different sites cannot collide.
    if len(parts) < 5:
        return ''
    url_parts = list(parts[4:])
    filename = url_parts[-1]
    suf = filename.split('.')[-1]
    # Cut only the trailing extension. str.replace(suf, '') would also delete
    # the same text elsewhere in the name ('html-tips.html' -> '-tips.'), so
    # the .html / .txt / .xml variants of one document would stop matching.
    url_parts[-1] = filename[:len(filename) - len(suf)]
    return '/'.join(url_parts)
def _read_successful_paths(list_file):
    """Return the first field of every line of *list_file* whose last field is '0'."""
    with open(list_file, "r") as f:
        rows = [line.split() for line in f]
    # Blank lines split into an empty list and are skipped instead of raising.
    return [fields[0] for fields in rows if fields and fields[-1] == '0']


def get_unprocessed_list():
    """Return the Japanese-translated files that are ready but not yet classified.

    A file from ``ja_translated_list_file`` (status '0' lines only) is
    selected when its feature (see ``get_feature``) also appears among the
    status-'0' entries of ``en_translated_list_file`` and among the entries
    returned by ``get_xml_converted_list(xml_log_dir)``, and does not appear
    in ``output_txt`` (the log of files already attempted, whatever their
    status).

    Returns:
        The selected paths, in reverse order of their appearance in
        ``ja_translated_list_file``.
    """
    ja_translated_list = _read_successful_paths(ja_translated_list_file)
    en_features = {get_feature(path)
                   for path in _read_successful_paths(en_translated_list_file)}
    xml_features = {get_feature(path)
                    for path in get_xml_converted_list(xml_log_dir)}

    classified_features = set()
    try:
        with open(output_txt, "r") as f:
            for line in f:
                fields = line.split()
                if fields:
                    classified_features.add(get_feature(fields[0]))
    except FileNotFoundError:
        # The log is created by the first write_to_access call; until then
        # nothing has been classified.
        pass

    unprocessed_list = []
    for ja_translated_file in ja_translated_list:
        feature = get_feature(ja_translated_file)
        if (feature in en_features
                and feature in xml_features
                and feature not in classified_features):
            unprocessed_list.append(ja_translated_file)
    unprocessed_list.reverse()
    return unprocessed_list
# Topic labels: one per line of the classes file, surrounding whitespace removed.
with open(classes_file, "r") as classes_fh:
    classes = list(map(str.strip, classes_fh))

# topic classifier
device = "cuda:{}".format(gpu_num)

# BERT encoder and its tokenizer, switched to eval mode and moved to the device.
bert, tokenizer = init_bert(bert_model)
bert.eval()
bert.to(device)

# Classification head; its weights are loaded from the saved state dict.
# NOTE(review): 768 / 11 are presumably the encoder hidden size and the number
# of topic classes -- confirm against classifiers.bertsimple.init_classifier.
classifier = init_classifier(768, 11)
classifier.load_state_dict(torch.load(bertsimple_model, map_location=device))
classifier.eval()
classifier.to(device)

# Juman instance handed to extract_meta_add_text.
juman = Juman()
# Debug switch: when set, run the pipeline once on a fixed sample file, print
# the resulting JSON and exit without entering the polling loop.
test_flag = False
if test_flag:
    input_path = './html/fr/en_translated/www.lemonde.fr/signataires/francois-beguin/?page=3/2020/11/03-08-45/?page=3.html'
    related_path = full_path_to_related_path(input_path)
    meta = extract_meta(data_dir, sourceinfo_file, related_path)
    meta = extract_meta_add_text(data_dir, meta, juman)
    meta = extract_meta_add_keyword_classify(keyword_file, meta)
    meta = extract_meta_add_bert_info(meta, classes, bert, classifier, tokenizer, batch, device)
    output_res = json.dumps(meta, ensure_ascii=False)
    print (output_res)
    # sys.exit() instead of exit(): the latter is injected by the `site`
    # module for interactive use and is not guaranteed to be available.
    sys.exit()
# Polling loop: repeatedly look for files that are ready for classification
# and process them, forever.
itr = 0
while True:
    unprocessed_list = get_unprocessed_list()
    unprocessed_len = len(unprocessed_list)
    print ("----------------------------------------------------")
    print (f"Topic Classification:There are {unprocessed_len} unprocessed files")
    print ("----------------------------------------------------")
    if unprocessed_len == 0:
        # Nothing to do yet: poll again shortly.
        time.sleep(10)
        continue
    itr += 1
    print ("Iteration {}".format(itr))
    for i, input_path in enumerate(unprocessed_list):
        print (i, input_path)
        try:
            process_one_file(input_path)
        except Exception as err:
            # One bad file must not stop the batch, but the failure is
            # reported, and KeyboardInterrupt / SystemExit are no longer
            # swallowed, so the process can be stopped while it is working.
            print ("failed to process {}: {!r}".format(input_path, err))
    # Pause between rounds so upstream steps can produce new files.
    time.sleep(100)