implement parsing human-readable names from URLs

Based off of commit 257b768436a0e8ab7887f9b790c5b92a7fe51ef5 Upstream-PR: https://github.com/NixOS/nix/pull/8678 Co-authored-by: Felix Uhl <felix.uhl@outlook.com> Change-Id: Idcb7f6191ca3310ef9dc854197f7798260c3f71d
author: Qyriad <qyriad@qyriad.me> 2024-04-30 18:11:14 -0600
committer: Qyriad <qyriad@qyriad.me> 2024-04-30 18:11:14 -0600
commit: 1425aa0b7cd0d3477589f75bea4fb9c74e057fed (patch)
tree: 80bb09919acabf760bd948a29901b2a67d409322 /src
parent: e2ab89a74b1d6044cea91e91f5c3d5fce203c2e8 (diff)
5 files changed, 89 insertions, 1 deletions
diff --git a/src/libutil/meson.build b/src/libutil/meson.build
index 069798a6f..8caa0532a 100644
--- a/src/libutil/meson.build
+++ b/src/libutil/meson.build
@@ -31,6 +31,7 @@ libutil_sources = files(
   'tarfile.cc',
   'thread-pool.cc',
   'url.cc',
+  'url-name.cc',
   'util.cc',
   'xml-writer.cc',
 )
@@ -92,6 +93,7 @@ libutil_headers = files(
   'topo-sort.hh',
   'types.hh',
   'url-parts.hh',
+  'url-name.hh',
   'url.hh',
   'util.hh',
   'variant-wrapper.hh',
diff --git a/src/libutil/url-name.cc b/src/libutil/url-name.cc
new file mode 100644
index 000000000..6ef58c80a
--- /dev/null
+++ b/src/libutil/url-name.cc
@@ -0,0 +1,59 @@
+#include <iostream>
+#include <regex>
+
+#include "url-name.hh"
+
+namespace nix {
+
+static std::string const attributeNamePattern("[a-z0-9_-]+"); // one attribute path element: lowercase letters, digits, '_' or '-'
+static std::regex const lastAttributeRegex("(?:" + attributeNamePattern + "\\.)*(?!default)(" + attributeNamePattern +")(\\^.*)?"); // group 1: last dotted element, rejected if it starts with "default"; group 2: optional "^..." suffix
+static std::string const pathSegmentPattern("[a-zA-Z0-9_-]+"); // one path segment: ASCII letters, digits, '_' or '-'
+static std::regex const lastPathSegmentRegex(".*/(" + pathSegmentPattern +")"); // group 1: the segment following the final '/'
+static std::regex const secondPathSegmentRegex("(?:" + pathSegmentPattern + ")/(" + pathSegmentPattern +")(?:/.*)?"); // group 1: second segment of "first/second[/...]"
+static std::regex const gitProviderRegex("github|gitlab|sourcehut"); // URL schemes handled as git hosting providers
+static std::regex const gitSchemeRegex("git($|\\+.*)"); // "git" alone, or "git+" followed by anything
+static std::regex const defaultOutputRegex(".*\\.default($|\\^.*)"); // anything ending in ".default", optionally followed by "^..."
+
+std::optional<std::string> getNameFromURL(ParsedURL const & url)
+{
+    std::smatch match;
+
+    /* A dir= query parameter takes priority over every other rule: return its value verbatim */
+    if (url.query.count("dir") > 0) {
+        return url.query.at("dir");
+    }
+
+    /* If the whole fragment is a dotted attribute path (one or more elements, optional "^..." suffix), use its last element. NOTE(review): the lookahead rejects any last element that merely starts with "default" */
+    if (std::regex_match(url.fragment, match, lastAttributeRegex)) {
+        return match.str(1);
+    }
+
+    /* For github/gitlab/sourcehut schemes, use the second path segment (presumably the repo name in owner/repo) */
+    if (
+        std::regex_match(url.scheme, gitProviderRegex)
+        && std::regex_match(url.path, match, secondPathSegmentRegex)
+    ) {
+        return match.str(1);
+    }
+
+    /* For the "git" and "git+..." schemes, use the last path segment */
+    if (
+        std::regex_match(url.scheme, gitSchemeRegex)
+        && std::regex_match(url.path, match, lastPathSegmentRegex)
+    ) {
+        return match.str(1);
+    }
+
+    /* Otherwise use a non-empty fragment in full unless it ends in ".default" (plus optional "^..."). NOTE(review): a bare "default" fragment has no '.' and is therefore returned as-is */
+    if (!url.fragment.empty() && !std::regex_match(url.fragment, defaultOutputRegex))
+        return url.fragment;
+
+    /* Last resort, for any scheme: take the final segment of the path */
+    if (std::regex_match(url.path, match, lastPathSegmentRegex))
+        return match.str(1);
+
+    /* If even that didn't work, the URL does not contain enough info to determine a useful name */
+    return {};
+}
+
+}
diff --git a/src/libutil/url-name.hh b/src/libutil/url-name.hh
new file mode 100644
index 000000000..3a3f88e76
--- /dev/null
+++ b/src/libutil/url-name.hh
@@ -0,0 +1,26 @@
+#pragma once
+///@file url-name.hh, for some heuristic-ish URL parsing.
+
+#include <string>
+#include <optional>
+
+#include "url.hh"
+#include "url-parts.hh"
+#include "util.hh"
+#include "split.hh"
+
+namespace nix {
+
+/**
+ * Try to extract a reasonably unique and meaningful, human-readable
+ * name of a flake output from a parsed URL.
+ * When nullopt is returned, the callsite should use information available
+ * to it outside of the URL to determine a useful name.
+ * This is a heuristic approach intended for user interfaces.
+ * @return nullopt if the extracted name is not useful to identify a
+ * flake output, for example because it is empty or "default".
+ * Otherwise returns the extracted name.
+ */
+std::optional<std::string> getNameFromURL(ParsedURL const & url);
+
+}
diff --git a/src/libutil/url-parts.hh b/src/libutil/url-parts.hh
index 6255c1d02..6efcc7e50 100644
--- a/src/libutil/url-parts.hh
+++ b/src/libutil/url-parts.hh
@@ -19,6 +19,7 @@ const static std::string userRegex = "(?:(?:" + unreservedRegex + "|" + pctEncod
 const static std::string authorityRegex = "(?:" + userRegex + "@)?" + hostRegex + "(?::[0-9]+)?";
 const static std::string pcharRegex = "(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|[:@])";
 const static std::string queryRegex = "(?:" + pcharRegex + "|[/? \"])*";
+const static std::string fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*";
 const static std::string segmentRegex = "(?:" + pcharRegex + "*)";
 const static std::string absPathRegex = "(?:(?:/" + segmentRegex + ")*/?)";
 const static std::string pathRegex = "(?:" + segmentRegex + "(?:/" + segmentRegex + ")*/?)";
diff --git a/src/libutil/url.cc b/src/libutil/url.cc
index a8f7d39fd..afccc4245 100644
--- a/src/libutil/url.cc
+++ b/src/libutil/url.cc
@@ -16,7 +16,7 @@ ParsedURL parseURL(const std::string & url)
         "((" + schemeRegex + "):"
         + "(?:(?://(" + authorityRegex + ")(" + absPathRegex + "))|(/?" + pathRegex + ")))"
         + "(?:\\?(" + queryRegex + "))?"
-        + "(?:#(" + queryRegex + "))?",
+        + "(?:#(" + fragmentRegex + "))?",
         std::regex::ECMAScript);
 
     std::smatch match;
author	Qyriad <qyriad@qyriad.me>	2024-04-30 18:11:14 -0600
committer	Qyriad <qyriad@qyriad.me>	2024-04-30 18:11:14 -0600
commit	1425aa0b7cd0d3477589f75bea4fb9c74e057fed (patch)
tree	80bb09919acabf760bd948a29901b2a67d409322 /src
parent	e2ab89a74b1d6044cea91e91f5c3d5fce203c2e8 (diff)