rt
This commit is contained in:
parent
3951159d55
commit
8d528190a9
@ -1,6 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import functools
|
import functools
|
||||||
import os
|
import re
|
||||||
import pickle
|
import pickle
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -79,22 +79,26 @@ def lru_file_cache(maxsize=128, ttl=None, filename=None):
|
|||||||
return decorator_function
|
return decorator_function
|
||||||
|
|
||||||
|
|
||||||
|
# Matches one code point in the CJK Unified Ideographs block (U+4E00-U+9FFF).
# Compiled once at import time instead of on every call; a single-character
# class is enough because callers only need to know whether any match exists.
_CHINESE_CHAR_RE = re.compile('[\u4e00-\u9fff]')


def contains_chinese(string):
    """
    Return True if the given string contains Chinese characters, False otherwise.

    The check is purely range-based: a character counts as Chinese when its
    code point lies in U+4E00..U+9FFF. An empty string yields False.
    """
    return _CHINESE_CHAR_RE.search(string) is not None
|
||||||
|
|
||||||
def extract_chinese_characters(file_path):
    """
    Collect the identifier nodes in a Python source file that contain Chinese.

    The file at file_path is read as UTF-8 and parsed with ast.parse. Every
    ast.Name node whose id satisfies contains_chinese() is printed to stdout
    and appended to the result.

    Returns the list of matching ast.Name nodes, in ast.walk() order.
    """
    import ast

    with open(file_path, 'r', encoding='utf-8') as source_file:
        tree = ast.parse(source_file.read())

    matches = []
    for candidate in ast.walk(tree):
        # Only plain name references are inspected; other node kinds are skipped.
        if not isinstance(candidate, ast.Name):
            continue
        if not contains_chinese(candidate.id):
            continue
        print(candidate.id)
        matches.append(candidate)
    return matches
|
|
||||||
|
|
||||||
def extract_chinese_characters_from_directory(directory_path):
|
def extract_chinese_characters_from_directory(directory_path):
|
||||||
chinese_characters = []
|
chinese_characters = []
|
||||||
|
Loading…
x
Reference in New Issue
Block a user