# Giskard-AI/giskard

# test_data_processing_pipeline.py3 代码处理

# 原 test 代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import numpy as np
import pandas as pd
import pytest

from giskard import Dataset, SlicingFunction, slicing_function
from giskard.registry.transformation_function import transformation_function


@slicing_function(name="slice with parenthesis")
def filter_with_parenthesis(x: pd.Series) -> bool:
    """Return True when the row's ``credit_amount`` is greater than 1000."""
    return x.credit_amount > 1000


@slicing_function(name="slice cell level", cell_level=True)
def filter_cell_level(amount: int) -> bool:
    """Cell-level predicate: True when the given value is greater than 1000."""
    return amount > 1000


@slicing_function
def filter_without_parenthesis(x: pd.Series) -> bool:
    """Return True when the row's ``credit_amount`` is greater than 2000."""
    return x.credit_amount > 2000


@transformation_function(name="transform with parenthesis")
def transform_with_parenthesis(x: pd.Series) -> pd.Series:
    """Set the row's ``credit_amount`` to -1 and return the row."""
    x.credit_amount = -1
    return x


@transformation_function
def transform_without_parenthesis(x: pd.Series) -> pd.Series:
    """Set the row's ``credit_amount`` to -2 and return the row."""
    x.credit_amount = -2
    return x


@transformation_function
def transform_divide_by_five(x: pd.Series) -> pd.Series:
    """Divide the row's ``credit_amount`` by 5 in place and return the row."""
    x.credit_amount /= 5
    return x


@transformation_function(cell_level=True)
def column_level_divide(nb: float, amount: int) -> float:
    """Cell-level transformation: return ``nb`` divided by ``amount``."""
    return nb / amount


def test_slicing(german_credit_data: Dataset):
    """Chain the two decorated row-level slices and check the remaining row counts."""
    assert len(german_credit_data.df) == 1000
    assert isinstance(filter_with_parenthesis, SlicingFunction), f"{type(filter_with_parenthesis)}"
    ds = german_credit_data.slice(filter_with_parenthesis)
    assert len(ds.df) == 884
    ds = ds.slice(filter_without_parenthesis)
    assert len(ds.df) == 568


def test_slicing_using_lambda(german_credit_data: Dataset):
    """Slice with plain lambdas and expect the same row counts as the decorated slices."""
    assert len(german_credit_data.df) == 1000
    ds = german_credit_data.slice(lambda x: x.credit_amount > 1000)
    assert len(ds.df) == 884
    ds = ds.slice(lambda x: x.credit_amount > 2000)
    assert len(ds.df) == 568


def test_slicing_cell_level(german_credit_data: Dataset):
    """Slice on single ``credit_amount`` cell values, via a decorated function and a lambda."""
    assert len(german_credit_data.df) == 1000
    # Check the cell-level function this test actually uses; the earlier version
    # re-checked filter_with_parenthesis, which looks like a copy-paste slip.
    assert isinstance(filter_cell_level, SlicingFunction), f"{type(filter_cell_level)}"
    ds = german_credit_data.slice(filter_cell_level, column_name="credit_amount")
    assert len(ds.df) == 884
    ds = ds.slice(lambda amount: amount > 2000, cell_level=True, column_name="credit_amount")
    assert len(ds.df) == 568


def test_chain(german_credit_data: Dataset):
    """Queue slice, transform, slice on the dataset and check the output of ``process()``."""
    assert len(german_credit_data.df) == 1000
    german_credit_data.add_slicing_function(filter_without_parenthesis)
    german_credit_data.add_transformation_function(transform_divide_by_five)
    german_credit_data.add_slicing_function(filter_with_parenthesis)
    # Queuing the steps must leave the source frame at its original size.
    assert len(german_credit_data.df) == 1000
    ds = german_credit_data.process()
    assert len(ds.df) == 188


def test_transform_cell_level(german_credit_data: Dataset):
    """Slice, divide ``credit_amount`` by 5 at cell level, slice again; expect 188 rows."""
    assert len(german_credit_data.df) == 1000

    ds = (
        german_credit_data.slice(filter_without_parenthesis)
        .transform(column_level_divide(amount=5), column_name="credit_amount")
        .slice(filter_with_parenthesis)
    )

    # The source dataset keeps all of its rows.
    assert len(german_credit_data.df) == 1000
    assert len(ds.df) == 188


def test_transform_cell_level_parameterized(german_credit_data: Dataset):
    """Same pipeline as the cell-level test, with ``column_name`` bound on the function call."""
    assert len(german_credit_data.df) == 1000

    ds = (
        german_credit_data.slice(filter_without_parenthesis)
        .transform(column_level_divide(column_name="credit_amount", amount=5))
        .slice(filter_with_parenthesis)
    )

    # The source dataset keeps all of its rows.
    assert len(german_credit_data.df) == 1000
    assert len(ds.df) == 188


def test_transform_cell_level_lambda(german_credit_data: Dataset):
    """Same pipeline as the cell-level test, using a lambda as the cell transformation."""
    assert len(german_credit_data.df) == 1000

    ds = (
        german_credit_data.slice(filter_without_parenthesis)
        .transform(lambda i: i / 5, cell_level=True, column_name="credit_amount")
        .slice(filter_with_parenthesis)
    )

    # The source dataset keeps all of its rows.
    assert len(german_credit_data.df) == 1000
    assert len(ds.df) == 188


def test_transformation(german_credit_data: Dataset):
    """Apply both decorated row transformations and check every ``credit_amount`` is overwritten."""
    ds = german_credit_data.transform(transform_without_parenthesis)
    assert np.all(ds.df.credit_amount == -2)
    ds = german_credit_data.transform(transform_with_parenthesis)
    assert np.all(ds.df.credit_amount == -1)
    # The source dataset keeps its size and its varied credit_amount values.
    assert len(german_credit_data.df) == 1000
    assert len(german_credit_data.df.credit_amount.unique()) > 1


def test_transformation_without_annotation(german_credit_data: Dataset):
    """Pass an undecorated function to ``transform`` and check it is applied to every row."""

    def transform_without_annotation(x: pd.Series) -> pd.Series:
        x.credit_amount = -2
        return x

    ds = german_credit_data.transform(transform_without_annotation)
    assert np.all(ds.df.credit_amount == -2)
    # The source dataset keeps its size and its varied credit_amount values.
    assert len(german_credit_data.df) == 1000
    assert len(german_credit_data.df.credit_amount.unique()) > 1


def test_missing_arg_slicing_function():
    """Decorating a zero-argument function with ``@slicing_function`` must raise TypeError."""
    with pytest.raises(
        TypeError, match="Required arg 0 of slice_fn to be <class 'pandas.core.series.Series'>, but none was defined"
    ):

        @slicing_function
        def slice_fn():
            return True


def test_wrong_type_slicing_function():
    """Decorating a function whose first argument is annotated ``int`` must raise TypeError."""
    with pytest.raises(
        TypeError,
        match="Required arg 0 of slice_fn to be <class 'pandas.core.series.Series'>, but <class 'int'> was defined",
    ):

        @slicing_function
        def slice_fn(row: int):
            return row > 0

        slice_fn("str")


def test_chain_with_parameters(german_credit_data: Dataset):
    """Queue parameterized slice/transform steps and check the output of ``process()``."""

    @slicing_function(name="row greater than")
    def filter_greater_than(x: pd.Series, row: str, threshold: int) -> bool:
        return x[row] > threshold

    @transformation_function
    def transform_divide_by(x: pd.Series, row: str, divider: int) -> pd.Series:
        x[row] /= divider
        return x

    assert len(german_credit_data.df) == 1000
    german_credit_data.add_slicing_function(filter_greater_than("credit_amount", 2000))
    german_credit_data.add_transformation_function(transform_divide_by("credit_amount", 5))
    german_credit_data.add_slicing_function(filter_greater_than("credit_amount", 1000))
    # Queuing the steps must leave the source frame at its original size.
    assert len(german_credit_data.df) == 1000
    ds = german_credit_data.process()
    assert len(ds.df) == 188


def test_transformation_without_type():
    """A ``row_level=True`` transformation with an unannotated parameter is applied to the row."""

    @transformation_function(row_level=True)
    def add_positive_sentence(row):
        row = row.copy()
        row.text += " I love this!"
        return row

    df = pd.DataFrame([{"text": "testing."}])
    dataset = Dataset(df, cat_columns=[])
    transformed_dataset = dataset.transform(add_positive_sentence)

    assert transformed_dataset.df.iloc[0].text == "testing. I love this!"

# 去除 assert 后的代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import numpy as np
import pandas as pd
import pytest

from giskard import Dataset, SlicingFunction, slicing_function
from giskard.registry.transformation_function import transformation_function


@slicing_function(name="slice with parenthesis")
def filter_with_parenthesis(x: pd.Series) -> bool:
    """Return True when the row's ``credit_amount`` is greater than 1000."""
    return x.credit_amount > 1000


@slicing_function(name="slice cell level", cell_level=True)
def filter_cell_level(amount: int) -> bool:
    """Cell-level predicate: True when the given value is greater than 1000."""
    return amount > 1000


@slicing_function
def filter_without_parenthesis(x: pd.Series) -> bool:
    """Return True when the row's ``credit_amount`` is greater than 2000."""
    return x.credit_amount > 2000


@transformation_function(name="transform with parenthesis")
def transform_with_parenthesis(x: pd.Series) -> pd.Series:
    """Set the row's ``credit_amount`` to -1 and return the row."""
    x.credit_amount = -1
    return x


@transformation_function
def transform_without_parenthesis(x: pd.Series) -> pd.Series:
    """Set the row's ``credit_amount`` to -2 and return the row."""
    x.credit_amount = -2
    return x


@transformation_function
def transform_divide_by_five(x: pd.Series) -> pd.Series:
    """Divide the row's ``credit_amount`` by 5 in place and return the row."""
    x.credit_amount /= 5
    return x


@transformation_function(cell_level=True)
def column_level_divide(nb: float, amount: int) -> float:
    """Cell-level transformation: return ``nb`` divided by ``amount``."""
    return nb / amount


def test_slicing(german_credit_data: Dataset):
    """Chain the two decorated row-level slices (this variant carries no assertions)."""
    ds = german_credit_data.slice(filter_with_parenthesis)
    ds = ds.slice(filter_without_parenthesis)


def test_slicing_using_lambda(german_credit_data: Dataset):
    """Chain two lambda-based row slices (this variant carries no assertions)."""
    ds = german_credit_data.slice(lambda x: x.credit_amount > 1000)
    ds = ds.slice(lambda x: x.credit_amount > 2000)


def test_slicing_cell_level(german_credit_data: Dataset):
    """Chain two cell-level slices on ``credit_amount`` (this variant carries no assertions)."""
    ds = german_credit_data.slice(filter_cell_level, column_name="credit_amount")
    ds = ds.slice(lambda amount: amount > 2000, cell_level=True, column_name="credit_amount")


def test_chain(german_credit_data: Dataset):
    """Queue slice, transform, slice and run ``process()`` (this variant carries no assertions)."""
    german_credit_data.add_slicing_function(filter_without_parenthesis)
    german_credit_data.add_transformation_function(transform_divide_by_five)
    german_credit_data.add_slicing_function(filter_with_parenthesis)
    ds = german_credit_data.process()


def test_transform_cell_level(german_credit_data: Dataset):
    """Slice, divide ``credit_amount`` by 5 at cell level, slice again (no assertions)."""

    ds = (
        german_credit_data.slice(filter_without_parenthesis)
        .transform(column_level_divide(amount=5), column_name="credit_amount")
        .slice(filter_with_parenthesis)
    )



def test_transform_cell_level_parameterized(german_credit_data: Dataset):
    """Cell-level pipeline with ``column_name`` bound on the function call (no assertions)."""

    ds = (
        german_credit_data.slice(filter_without_parenthesis)
        .transform(column_level_divide(column_name="credit_amount", amount=5))
        .slice(filter_with_parenthesis)
    )



def test_transform_cell_level_lambda(german_credit_data: Dataset):
    """Cell-level pipeline using a lambda as the transformation (no assertions)."""

    ds = (
        german_credit_data.slice(filter_without_parenthesis)
        .transform(lambda i: i / 5, cell_level=True, column_name="credit_amount")
        .slice(filter_with_parenthesis)
    )



def test_transformation(german_credit_data: Dataset):
    """Apply both decorated row transformations (this variant carries no assertions)."""
    ds = german_credit_data.transform(transform_without_parenthesis)
    ds = german_credit_data.transform(transform_with_parenthesis)



def test_transformation_without_annotation(german_credit_data: Dataset):
    """Pass an undecorated function to ``transform`` (this variant carries no assertions)."""

    def transform_without_annotation(x: pd.Series) -> pd.Series:
        x.credit_amount = -2
        return x

    ds = german_credit_data.transform(transform_without_annotation)



def test_missing_arg_slicing_function():
    """Decorating a zero-argument function with ``@slicing_function`` must raise TypeError."""
    with pytest.raises(
        TypeError, match="Required arg 0 of slice_fn to be <class 'pandas.core.series.Series'>, but none was defined"
    ):

        @slicing_function
        def slice_fn():
            return True


def test_wrong_type_slicing_function():
    """Decorating a function whose first argument is annotated ``int`` must raise TypeError."""
    with pytest.raises(
        TypeError,
        match="Required arg 0 of slice_fn to be <class 'pandas.core.series.Series'>, but <class 'int'> was defined",
    ):

        @slicing_function
        def slice_fn(row: int):
            return row > 0

        slice_fn("str")


def test_chain_with_parameters(german_credit_data: Dataset):
    """Queue parameterized slice/transform steps and run ``process()`` (no assertions)."""

    @slicing_function(name="row greater than")
    def filter_greater_than(x: pd.Series, row: str, threshold: int) -> bool:
        return x[row] > threshold

    @transformation_function
    def transform_divide_by(x: pd.Series, row: str, divider: int) -> pd.Series:
        x[row] /= divider
        return x

    german_credit_data.add_slicing_function(filter_greater_than("credit_amount", 2000))
    german_credit_data.add_transformation_function(transform_divide_by("credit_amount", 5))
    german_credit_data.add_slicing_function(filter_greater_than("credit_amount", 1000))
    ds = german_credit_data.process()


def test_transformation_without_type():
    """Run a ``row_level=True`` transformation with an unannotated parameter (no assertions)."""

    @transformation_function(row_level=True)
    def add_positive_sentence(row):
        row = row.copy()
        row.text += " I love this!"
        return row

    df = pd.DataFrame([{"text": "testing."}])
    dataset = Dataset(df, cat_columns=[])
    transformed_dataset = dataset.transform(add_positive_sentence)

# gpt3.5 自动填充的代码

base prompt 为:我这个 tests 代码缺少 assert 断言,请你自动填充

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import numpy as np
import pandas as pd
import pytest

from giskard import Dataset, SlicingFunction, slicing_function
from giskard.registry.transformation_function import transformation_function


@slicing_function(name="slice with parenthesis")
def filter_with_parenthesis(x: pd.Series) -> bool:
    """Return True when the row's ``credit_amount`` is greater than 1000."""
    return x.credit_amount > 1000


@slicing_function(name="slice cell level", cell_level=True)
def filter_cell_level(amount: int) -> bool:
    """Cell-level predicate: True when the given value is greater than 1000."""
    return amount > 1000


@slicing_function
def filter_without_parenthesis(x: pd.Series) -> bool:
    """Return True when the row's ``credit_amount`` is greater than 2000."""
    return x.credit_amount > 2000


@transformation_function(name="transform with parenthesis")
def transform_with_parenthesis(x: pd.Series) -> pd.Series:
    """Set the row's ``credit_amount`` to -1 and return the row."""
    x.credit_amount = -1
    return x


@transformation_function
def transform_without_parenthesis(x: pd.Series) -> pd.Series:
    """Set the row's ``credit_amount`` to -2 and return the row."""
    x.credit_amount = -2
    return x


@transformation_function
def transform_divide_by_five(x: pd.Series) -> pd.Series:
    """Divide the row's ``credit_amount`` by 5 in place and return the row."""
    x.credit_amount /= 5
    return x


@transformation_function(cell_level=True)
def column_level_divide(nb: float, amount: int) -> float:
    """Cell-level transformation: return ``nb`` divided by ``amount``."""
    return nb / amount


def test_slicing(german_credit_data: Dataset):
    """Chain the two decorated row-level slices; only checks each result is non-empty."""
    ds = german_credit_data.slice(filter_with_parenthesis)
    assert len(ds) > 0  # Example assertion
    ds = ds.slice(filter_without_parenthesis)
    assert len(ds) > 0  # Example assertion


def test_slicing_using_lambda(german_credit_data: Dataset):
    """Chain two lambda-based row slices; only checks each result is non-empty."""
    ds = german_credit_data.slice(lambda x: x.credit_amount > 1000)
    assert len(ds) > 0  # Example assertion
    ds = ds.slice(lambda x: x.credit_amount > 2000)
    assert len(ds) > 0  # Example assertion


def test_slicing_cell_level(german_credit_data: Dataset):
    """Chain two cell-level slices on ``credit_amount``; only checks each result is non-empty."""
    ds = german_credit_data.slice(filter_cell_level, column_name="credit_amount")
    assert len(ds) > 0  # Example assertion
    ds = ds.slice(lambda amount: amount > 2000, cell_level=True, column_name="credit_amount")
    assert len(ds) > 0  # Example assertion


def test_chain(german_credit_data: Dataset):
    """Queue slice, transform, slice and run ``process()``; only checks the result is non-empty."""
    german_credit_data.add_slicing_function(filter_without_parenthesis)
    german_credit_data.add_transformation_function(transform_divide_by_five)
    german_credit_data.add_slicing_function(filter_with_parenthesis)
    ds = german_credit_data.process()
    assert len(ds) > 0  # Example assertion


def test_transform_cell_level(german_credit_data: Dataset):
    """Slice, divide ``credit_amount`` by 5 at cell level, slice again; checks non-emptiness only."""
    ds = (
        german_credit_data.slice(filter_without_parenthesis)
        .transform(column_level_divide(amount=5), column_name="credit_amount")
        .slice(filter_with_parenthesis)
    )
    assert len(ds) > 0  # Example assertion


def test_transform_cell_level_parameterized(german_credit_data: Dataset):
    """Cell-level pipeline with ``column_name`` bound on the function call; checks non-emptiness only."""
    ds = (
        german_credit_data.slice(filter_without_parenthesis)
        .transform(column_level_divide(column_name="credit_amount", amount=5))
        .slice(filter_with_parenthesis)
    )
    assert len(ds) > 0  # Example assertion


def test_transform_cell_level_lambda(german_credit_data: Dataset):
    """Cell-level pipeline using a lambda as the transformation; checks non-emptiness only."""
    ds = (
        german_credit_data.slice(filter_without_parenthesis)
        .transform(lambda i: i / 5, cell_level=True, column_name="credit_amount")
        .slice(filter_with_parenthesis)
    )
    assert len(ds) > 0  # Example assertion


def test_transformation(german_credit_data: Dataset):
    """Apply both decorated row transformations; only checks each result is non-empty."""
    ds = german_credit_data.transform(transform_without_parenthesis)
    assert len(ds) > 0  # Example assertion
    ds = german_credit_data.transform(transform_with_parenthesis)
    assert len(ds) > 0  # Example assertion


def test_transformation_without_annotation(german_credit_data: Dataset):
    """Pass an undecorated function to ``transform``; only checks the result is non-empty."""

    def transform_without_annotation(x: pd.Series) -> pd.Series:
        x.credit_amount = -2
        return x

    ds = german_credit_data.transform(transform_without_annotation)
    assert len(ds) > 0  # Example assertion


def test_missing_arg_slicing_function():
    """Decorating a zero-argument function with ``@slicing_function`` must raise TypeError."""
    with pytest.raises(
        TypeError, match="Required arg 0 of slice_fn to be <class 'pandas.core.series.Series'>, but none was defined"
    ):

        @slicing_function
        def slice_fn():
            return True


def test_wrong_type_slicing_function():
    """Decorating a function whose first argument is annotated ``int`` must raise TypeError."""
    with pytest.raises(
        TypeError,
        match="Required arg 0 of slice_fn to be <class 'pandas.core.series.Series'>, but <class 'int'> was defined",
    ):

        @slicing_function
        def slice_fn(row: int):
            return row > 0

        slice_fn("str")


def test_chain_with_parameters(german_credit_data: Dataset):
    """Queue parameterized slice/transform steps and run ``process()``; checks non-emptiness only."""

    @slicing_function(name="row greater than")
    def filter_greater_than(x: pd.Series, row: str, threshold: int) -> bool:
        return x[row] > threshold

    @transformation_function
    def transform_divide_by(x: pd.Series, row: str, divider: int) -> pd.Series:
        x[row] /= divider
        return x

    german_credit_data.add_slicing_function(filter_greater_than("credit_amount", 2000))
    german_credit_data.add_transformation_function(transform_divide_by("credit_amount", 5))
    german_credit_data.add_slicing_function(filter_greater_than("credit_amount", 1000))
    ds = german_credit_data.process()
    assert len(ds) > 0  # Example assertion


def test_transformation_without_type():
    """Run a ``row_level=True`` transformation with an unannotated parameter; checks non-emptiness only."""

    @transformation_function(row_level=True)
    def add_positive_sentence(row):
        row = row.copy()
        row.text += " I love this!"
        return row

    df = pd.DataFrame([{"text": "testing."}])
    dataset = Dataset(df, cat_columns=[])
    transformed_dataset = dataset.transform(add_positive_sentence)
    assert len(transformed_dataset) > 0  # Example assertion