from datasets import list_datasets
Understanding Huggingface datasets
nlp
huggingface
This blog will dive into huggingface datasets
Get the list of all available datasets from Huggingface
all_datasets = list_datasets()
len(all_datasets), all_datasets[:10]
(14199,
['acronym_identification',
'ade_corpus_v2',
'adversarial_qa',
'aeslc',
'afrikaans_ner_corpus',
'ag_news',
'ai2_arc',
'air_dialogue',
'ajgt_twitter_ar',
'allegro_reviews'])
Load a dataset from huggingface
from datasets import load_dataset
Load the whole dataset
emotions = load_dataset('emotion')
emotions
Using custom data configuration default
Reusing dataset emotion (/Users/youfeng/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)
DatasetDict({
train: Dataset({
features: ['text', 'label'],
num_rows: 16000
})
validation: Dataset({
features: ['text', 'label'],
num_rows: 2000
})
test: Dataset({
features: ['text', 'label'],
num_rows: 2000
})
})
Load part of the dataset
emotions = load_dataset('emotion', split='train')
emotions
Using custom data configuration default
Reusing dataset emotion (/Users/youfeng/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)
Dataset({
features: ['text', 'label'],
num_rows: 16000
})
emotions = load_dataset('emotion', split=['validation', 'test'])
emotions
Using custom data configuration default
Reusing dataset emotion (/Users/youfeng/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)
[Dataset({
features: ['text', 'label'],
num_rows: 2000
}),
Dataset({
features: ['text', 'label'],
num_rows: 2000
})]
Load a CSV file from remote or local
import pandas as pd
import numpy as np
trn_csv = 'https://raw.githubusercontent.com/youfenglab/nlp-with-huggingface/master/data/train.csv'
tst_csv = 'https://raw.githubusercontent.com/youfenglab/nlp-with-huggingface/master/data/train.csv'
trn_df = pd.read_csv(trn_csv)
label_cols = trn_df.columns[2:]
label_cols
Index(['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar',
'conventions'],
dtype='object')
trn_ds = load_dataset("csv", data_files=trn_csv)  # or several files []
trn_ds
Using custom data configuration default-8f5d5f2de1b27b24
Reusing dataset csv (/Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
DatasetDict({
train: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 3911
})
})
If you want to split the training set into train and validation parts, you can do it as below
ds = trn_ds['train'].train_test_split(test_size=0.2)  # Here we use 20% samples as validation part
ds
DatasetDict({
train: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 3128
})
test: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 783
})
})
If you want to use the param “stratify” like in scikit-learn, you can do it as below.
Let's say we want to use the cohesion column.
trn_ds
DatasetDict({
train: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 3911
})
})
ds = trn_ds.class_encode_column('cohesion')
ds['train'].features
Loading cached processed dataset at /Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-8be23562eeb71b18.arrow
Loading cached processed dataset at /Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-b4dba551c432230d.arrow
Loading cached processed dataset at /Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-3e6fbdaa550cb114.arrow
{'text_id': Value(dtype='string', id=None),
'full_text': Value(dtype='string', id=None),
'cohesion': ClassLabel(num_classes=9, names=['1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0'], id=None),
'syntax': Value(dtype='float64', id=None),
'vocabulary': Value(dtype='float64', id=None),
'phraseology': Value(dtype='float64', id=None),
'grammar': Value(dtype='float64', id=None),
'conventions': Value(dtype='float64', id=None)}
ds = ds['train'].train_test_split(test_size=0.2, stratify_by_column="cohesion")
ds
DatasetDict({
train: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 3128
})
test: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 783
})
})