@HeenaR17
Created December 21, 2020 17:07
import torch

# Load the WMT'19 single-model transformers from the fairseq torch.hub registry:
# English->German and German->English, with Moses tokenization and fastBPE.
english_to_german = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model', tokenizer='moses', bpe='fastbpe')
german_to_english = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en.single_model', tokenizer='moses', bpe='fastbpe')

data = ["back translation is one of the best data augmentation techniques"]

def augment_data(data, x_to_y, y_to_x, n):
    """Back-translate each sentence (x -> y -> x), keeping the top-n beams at each step."""
    augmented_data = dict()
    for d in data:
        augmented_data[d] = list()
        # Translate the source sentence into the pivot language, keeping n hypotheses.
        y_result = x_to_y.generate(x_to_y.encode(d), beam=n)
        for y in y_result:
            # Translate each pivot hypothesis back into the source language.
            x_result = y_to_x.generate(y_to_x.encode(x_to_y.decode(y['tokens'])), beam=n)
            for x in x_result:
                augmented_data[d].append(y_to_x.decode(x['tokens']))
    return augmented_data

def print_data(data):
    for inp, out in data.items():
        print(inp + ":")
        for x in out:
            print("  " + x)

result = augment_data(data, english_to_german, german_to_english, 3)
print_data(result)
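With beam=3 at both translation steps, each input sentence yields up to nine round-trip paraphrases (duplicates possible), printed beneath the original sentence. Increasing n broadens the set of paraphrases at the cost of quadratically more decoding work per sentence.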