# sumeval

# 简介

  • 不仅支持英文,还支持日文和中文,其他语言也可以轻松扩展
  • 安装方法:pip install sumeval

# test_rouge.py 代码分析

# test_rouge.py 原代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
import json
import sys
import unittest
from rougescore import rouge_n, rouge_l
from pythonrouge.pythonrouge import Pythonrouge
from sumeval.metrics.rouge import RougeCalculator


class TestRouge(unittest.TestCase):
    """Cross-check ``RougeCalculator`` scores against two baselines.

    Every test computes the same score three ways for each summary in the
    test data -- via ``Pythonrouge`` (``b1_v``), via ``rougescore``
    (``b2_v``) and via ``RougeCalculator`` (``v``) -- and asserts that the
    values differ by less than ``1e-5``.
    """

    # Directory holding the ROUGE fixtures, resolved relative to this file.
    DATA_DIR = os.path.join(os.path.dirname(__file__), "data/rouge")

    def load_test_data(self):
        """Load and return the parsed contents of ``ROUGE-test.json``."""
        test_file = os.path.join(self.DATA_DIR, "ROUGE-test.json")
        with open(test_file, encoding="utf-8") as f:
            data = json.load(f)
        return data

    def test_rouge(self):
        """ROUGE-1/2 with ``stopwords=False`` must match both baselines."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=False)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=False,
                        stemming=False, stopwords=False)
                    b1_v = baseline.calc_score()
                    # The trailing 0.5 is presumably the precision/recall
                    # weighting of rougescore -- TODO confirm.
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)
                    self.assertLess(abs(b2_v - v), 1e-5)
                    self.assertLess(abs(b1_v["ROUGE-{}-F".format(n)] - v), 1e-5)  # noqa

    def test_rouge_with_stop_word(self):
        """ROUGE-1/2 with ``stopwords=True`` must match both baselines."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=False,
                        stemming=False, stopwords=True)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)
                    self.assertLess(abs(b2_v - v), 1e-5)
                    self.assertLess(abs(b1_v["ROUGE-{}-F".format(n)] - v), 1e-5)  # noqa

    def test_rouge_with_length_limit(self):
        """ROUGE-1/2 with ``length_limit=50`` must match both baselines.

        The Pythonrouge baseline is configured with ``length=50`` and
        ``word_level=False``.
        """
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True, length_limit=50)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=True, length=50,
                        word_level=False,
                        stemming=False, stopwords=True)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)
                    self.assertLess(abs(b2_v - v), 1e-5)
                    self.assertLess(abs(b1_v["ROUGE-{}-F".format(n)] - v), 1e-5)  # noqa

    def test_rouge_with_word_limit(self):
        """ROUGE-1/2 with ``word_limit=5`` must match both baselines.

        The Pythonrouge baseline is configured with ``length=5`` and
        ``word_level=True``.
        """
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True, word_limit=5)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=True, length=5,
                        word_level=True,
                        stemming=False, stopwords=True)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)
                    self.assertLess(abs(b2_v - v), 1e-5)
                    self.assertLess(abs(b1_v["ROUGE-{}-F".format(n)] - v), 1e-5)  # noqa

    def test_rouge_l(self):
        """ROUGE-L with ``stopwords=True`` must match both baselines."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for s in summaries:
                # NOTE(review): the baseline applies length_limit=True,
                # length=50 while RougeCalculator above is built without a
                # limit -- confirm this asymmetry is intended.
                baseline = Pythonrouge(
                    summary_file_exist=False,
                    summary=[[s]],
                    reference=[[[r] for r in references]],
                    n_gram=1, recall_only=False, ROUGE_L=True,
                    length_limit=True, length=50,
                    stemming=False, stopwords=True)
                b1_v = baseline.calc_score()
                b2_v = rouge_l(rouge.tokenize(s),
                               [rouge.tokenize(r) for r in references],
                               0.5)
                v = rouge.rouge_l(s, references)
                self.assertLess(abs(b2_v - v), 1e-5)
                self.assertLess(abs(b1_v["ROUGE-L-F"] - v), 1e-5)


# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()

# 去掉 assert 断言和一些辅助说明后喂给 LLM 的 test 代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import json
import sys
import unittest
from rougescore import rouge_n, rouge_l
from pythonrouge.pythonrouge import Pythonrouge
from sumeval.metrics.rouge import RougeCalculator


class TestRouge(unittest.TestCase):
    """Assertion-free variant of the ROUGE cross-check tests.

    Each test computes a score three ways -- via ``Pythonrouge``
    (``b1_v``), via ``rougescore`` (``b2_v``) and via ``RougeCalculator``
    (``v``) -- but does not compare the results.
    """

    # Directory holding the ROUGE fixtures, resolved relative to this file.
    DATA_DIR = os.path.join(os.path.dirname(__file__), "data/rouge")

    def load_test_data(self):
        """Load and return the parsed contents of ``ROUGE-test.json``."""
        test_file = os.path.join(self.DATA_DIR, "ROUGE-test.json")
        with open(test_file, encoding="utf-8") as f:
            data = json.load(f)
        return data

    def test_rouge(self):
        """Compute ROUGE-1/2 three ways with ``stopwords=False``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=False)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=False,
                        stemming=False, stopwords=False)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)

    def test_rouge_with_stop_word(self):
        """Compute ROUGE-1/2 three ways with ``stopwords=True``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=False,
                        stemming=False, stopwords=True)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)

    def test_rouge_with_length_limit(self):
        """Compute ROUGE-1/2 three ways with ``length_limit=50``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True, length_limit=50)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=True, length=50,
                        word_level=False,
                        stemming=False, stopwords=True)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)

    def test_rouge_with_word_limit(self):
        """Compute ROUGE-1/2 three ways with ``word_limit=5``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True, word_limit=5)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=True, length=5,
                        word_level=True,
                        stemming=False, stopwords=True)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)

    def test_rouge_l(self):
        """Compute ROUGE-L three ways with ``stopwords=True``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for s in summaries:
                baseline = Pythonrouge(
                    summary_file_exist=False,
                    summary=[[s]],
                    reference=[[[r] for r in references]],
                    n_gram=1, recall_only=False, ROUGE_L=True,
                    length_limit=True, length=50,
                    stemming=False, stopwords=True)
                b1_v = baseline.calc_score()
                b2_v = rouge_l(rouge.tokenize(s),
                               [rouge.tokenize(r) for r in references],
                               0.5)
                v = rouge.rouge_l(s, references)



# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()

# LLM 自动填充 assert 断言后的代码

**base prompt 为:** 该 tests case 缺少 assert 断言 请你自动填充它

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
import json
import sys
import unittest
from rougescore import rouge_n, rouge_l
from pythonrouge.pythonrouge import Pythonrouge
from sumeval.metrics.rouge import RougeCalculator


class TestRouge(unittest.TestCase):
    """ROUGE cross-check tests with ``assertAlmostEqual`` comparisons.

    Each test computes a score three ways -- via ``Pythonrouge``
    (``b1_v``), via ``rougescore`` (``b2_v``) and via ``RougeCalculator``
    (``v``) -- and compares the Pythonrouge value against the other two to
    4 decimal places.

    NOTE(review): the assertions read ``b1_v["ROUGE-{n}"]`` and
    ``b1_v["ROUGE-L"]``, whereas the hand-written version of this test
    reads ``"ROUGE-{n}-F"`` and ``"ROUGE-L-F"``. Confirm that the un-suffixed
    keys exist in ``calc_score()`` output when ``recall_only=False``;
    otherwise these lookups raise ``KeyError``.
    """

    # Directory holding the ROUGE fixtures, resolved relative to this file.
    DATA_DIR = os.path.join(os.path.dirname(__file__), "data/rouge")

    def load_test_data(self):
        """Load and return the parsed contents of ``ROUGE-test.json``."""
        test_file = os.path.join(self.DATA_DIR, "ROUGE-test.json")
        with open(test_file, encoding="utf-8") as f:
            data = json.load(f)
        return data

    def test_rouge(self):
        """Compare ROUGE-1/2 scores computed with ``stopwords=False``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=False)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=False,
                        stemming=False, stopwords=False)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)
                    self.assertAlmostEqual(b1_v["ROUGE-{}".format(n)], b2_v, places=4)
                    self.assertAlmostEqual(b1_v["ROUGE-{}".format(n)], v, places=4)

    def test_rouge_with_stop_word(self):
        """Compare ROUGE-1/2 scores computed with ``stopwords=True``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=False,
                        stemming=False, stopwords=True)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)
                    self.assertAlmostEqual(b1_v["ROUGE-{}".format(n)], b2_v, places=4)
                    self.assertAlmostEqual(b1_v["ROUGE-{}".format(n)], v, places=4)

    def test_rouge_with_length_limit(self):
        """Compare ROUGE-1/2 scores computed with ``length_limit=50``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True, length_limit=50)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=True, length=50,
                        word_level=False,
                        stemming=False, stopwords=True)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)
                    self.assertAlmostEqual(b1_v["ROUGE-{}".format(n)], b2_v, places=4)
                    self.assertAlmostEqual(b1_v["ROUGE-{}".format(n)], v, places=4)

    def test_rouge_with_word_limit(self):
        """Compare ROUGE-1/2 scores computed with ``word_limit=5``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True, word_limit=5)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for n in [1, 2]:
                for s in summaries:
                    baseline = Pythonrouge(
                        summary_file_exist=False,
                        summary=[[s]],
                        reference=[[[r] for r in references]],
                        n_gram=n, recall_only=False,
                        length_limit=True, length=5,
                        word_level=True,
                        stemming=False, stopwords=True)
                    b1_v = baseline.calc_score()
                    b2_v = rouge_n(rouge.tokenize(s),
                                   [rouge.tokenize(r) for r in references],
                                   n, 0.5)
                    v = rouge.rouge_n(s, references, n)
                    self.assertAlmostEqual(b1_v["ROUGE-{}".format(n)], b2_v, places=4)
                    self.assertAlmostEqual(b1_v["ROUGE-{}".format(n)], v, places=4)

    def test_rouge_l(self):
        """Compare ROUGE-L scores computed with ``stopwords=True``."""
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for s in summaries:
                baseline = Pythonrouge(
                    summary_file_exist=False,
                    summary=[[s]],
                    reference=[[[r] for r in references]],
                    n_gram=1, recall_only=False, ROUGE_L=True,
                    length_limit=True, length=50,
                    stemming=False, stopwords=True)
                b1_v = baseline.calc_score()
                b2_v = rouge_l(rouge.tokenize(s),
                               [rouge.tokenize(r) for r in references],
                               0.5)
                v = rouge.rouge_l(s, references)
                self.assertAlmostEqual(b1_v["ROUGE-L"], b2_v, places=4)
                self.assertAlmostEqual(b1_v["ROUGE-L"], v, places=4)


# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()