skydive/
storage_gcs.rs

1use anyhow::Result;
2
3use chrono::{DateTime, Utc};
4use cloud_storage::{object::ObjectList, sync::Client, ListRequest};
5
/// Split a GCS path into a bucket name and an object prefix.
///
/// The path is expected to look like `gs://bucket_name/prefix`. A leading `gs://`
/// scheme is removed when present; the text up to the first `/` after it is the
/// bucket name and everything following that `/` is the prefix.
///
/// The input is not validated: a path without the `gs://` scheme is split as-is,
/// and a path with no `/` after the bucket name yields an empty prefix.
///
/// # Arguments
///
/// * `path` - A string slice that holds the GCS path.
///
/// # Returns
///
/// A tuple with the bucket name and the prefix.
#[must_use]
pub fn gcs_split_path(path: &str) -> (String, String) {
    // A fixed prefix does not need a regex; `strip_prefix` avoids compiling one per call.
    let path = path.strip_prefix("gs://").unwrap_or(path);

    // Split at the first '/' only, so later slashes remain part of the prefix.
    let (bucket_name, prefix) = path.split_once('/').unwrap_or((path, ""));

    (bucket_name.to_string(), prefix.to_string())
}
31/// List all files in a GCS path.
32/// The GCS path should be in the format `gs://bucket_name/prefix`.
33///
34/// # Arguments
35///
36/// * `path` - A string slice that holds the GCS path.
37///
38/// # Returns
39///
40/// A vector of `ObjectList` objects representing the files in the GCS path.
41///
42/// # Errors
43///
44/// This function returns an error if the GCS client cannot be created or the object list cannot be read.
45///
46/// # Panics
47///
48/// This function panics if the GCS path is invalid.
49pub fn gcs_list_files(path: &str) -> Result<Vec<ObjectList>> {
50    let (bucket_name, prefix) = gcs_split_path(path);
51
52    let client = Client::new()?;
53    let file_list = client.object().list(
54        &bucket_name,
55        ListRequest {
56            prefix: Some(prefix),
57            ..Default::default()
58        },
59    )?;
60
61    Ok(file_list)
62}
63
64/// Get the update time of a file in GCS.
65/// The GCS path should be in the format `gs://bucket_name/prefix`.
66///
67/// # Arguments
68///
69/// * `path` - A string slice that holds the GCS path.
70///
71/// # Returns
72///
73/// A `DateTime<Utc>` object representing the update time of the file.
74///
75/// # Errors
76///
77/// This function returns an error if the GCS client cannot be created or the object cannot be read.
78///
79/// # Panics
80///
81/// This function panics if the GCS path is invalid.
82pub fn gcs_get_file_update_time(path: &str) -> Result<DateTime<Utc>> {
83    let (bucket_name, prefix) = gcs_split_path(path);
84
85    let client = Client::new()?;
86    let object = client.object().read(&bucket_name, &prefix)?;
87
88    Ok(object.updated)
89}
90
91/// Download a file from GCS and return the local filename.
92/// The GCS path should be in the format `gs://bucket_name/prefix`.
93///
94/// # Arguments
95///
96/// * `path` - A string slice that holds the GCS path.
97///
98/// # Returns
99///
100/// A string with the local filename.
101///
102/// # Errors
103///
104/// This function returns an error if the GCS client cannot be created
105/// or the object cannot be downloaded.
106///
107/// # Panics
108///
109/// This function panics if the GCS path is invalid.
110pub fn gcs_download_file(path: &str) -> Result<String> {
111    let (bucket_name, prefix) = gcs_split_path(&path);
112    let filename = prefix.split('/').last().unwrap_or_default().to_string();
113
114    if !std::path::Path::new(&filename).exists() {
115        let client = Client::new().unwrap();
116        let bytes = client.object().download(&bucket_name, &prefix).unwrap();
117
118        std::fs::write(&filename, bytes)?;
119    }
120
121    Ok(filename)
122}
123
124/// List all files in a GCS path with a specific suffix.
125///
126/// # Arguments
127///
128/// * `path` - A string slice that holds the GCS path.
129/// * `suffix` - A string slice that holds the suffix to filter the files.
130///
131/// # Returns
132///
133/// A vector of strings with the names of the files that match the suffix.
134///
135/// # Errors
136///
137/// This function returns an error if the GCS client cannot be created or the object cannot be read.
138///
139/// # Panics
140///
141/// This function panics if the GCS path is invalid.
142pub fn gcs_list_files_of_type(path: &str, suffix: &str) -> Result<Vec<String>> {
143    let file_list = gcs_list_files(&path).unwrap();
144
145    let bam_files: Vec<_> = file_list
146        .iter()
147        .flat_map(|fs| {
148            fs.items
149                .iter()
150                .filter_map(|f| {
151                    if f.name.ends_with(suffix) {
152                        Some(f.name.clone())
153                    } else {
154                        None
155                    }
156                })
157                .collect::<Vec<_>>()
158        })
159        .collect();
160
161    Ok(bam_files)
162}