This resembles XML data, so I would approach it as such.
It is widely known that regex is not well suited to structured data such as markup languages.
So I suggest leveraging XDocument for that task:
/// <summary>
/// Empties every <c>value</c> element of an XML-like fragment and returns the
/// re-serialized fragment.
/// </summary>
/// <param name="input">
/// Fragment to sanitize. It is wrapped in a temporary root element before parsing,
/// so it may contain several top-level nodes.
/// </param>
/// <param name="writeFullClosingNode">
/// When <c>true</c>, empty elements are written as <c>&lt;value&gt;&lt;/value&gt;</c>;
/// otherwise the plain <c>XmlTextWriter</c> output (self-closing elements) is used.
/// </param>
/// <returns>
/// The sanitized fragment without the temporary root, or <paramref name="input"/>
/// unchanged when it cannot be parsed or the root was serialized without content.
/// </returns>
string SanitizeXmlLikeString(string input, bool writeFullClosingNode = true)
{
    const string openRootNode = "<root>";
    const string closeRootNode = "</root>";

    try
    {
        // Wrap the fragment so that input with several top-level nodes still parses.
        var xDoc = XDocument.Parse($"{openRootNode}{input}{closeRootNode}");

        // Remove any content that is in "value" nodes.
        foreach (var valueNode in xDoc.Descendants("value"))
        {
            valueNode.RemoveAll();
        }

        // Serialize to a string writer: no byte encoding is involved, so no
        // byte-order mark ends up in front of the markup.
        using var stringWriter = new StringWriter();
        using XmlTextWriter xmlWriter = writeFullClosingNode
            ? new FullElementXmlTextWriter(stringWriter)
            : new XmlTextWriter(stringWriter);
        xDoc.Root.WriteTo(xmlWriter);
        xmlWriter.Flush();

        var serialized = stringWriter.ToString();

        // Strip the temporary root only when it is really there; an empty root
        // written in self-closing form has nothing to unwrap.
        var hasWrapper =
            serialized.Length >= openRootNode.Length + closeRootNode.Length
            && serialized.StartsWith(openRootNode, StringComparison.Ordinal)
            && serialized.EndsWith(closeRootNode, StringComparison.Ordinal);

        return hasWrapper
            ? serialized[openRootNode.Length..^closeRootNode.Length]
            : input;
    }
    catch (XmlException)
    {
        // The input is not well-formed enough to parse: return the original string.
        return input;
    }
}
The only thing to mention besides the inline comments is that XmlTextWriter writes empty nodes in self-closing form, <value />, which might not be what you want. To handle that, we can use the approach suggested in this SO post to write the full element <value></value> even when it is empty:
/// <summary>
/// An <see cref="XmlTextWriter"/> whose <see cref="WriteEndElement"/> always
/// delegates to the base <c>WriteFullEndElement</c>, so that elements are closed
/// with an explicit end tag.
/// </summary>
public class FullElementXmlTextWriter : XmlTextWriter
{
    /// <summary>Creates a writer that writes to the file <paramref name="filename"/>.</summary>
    public FullElementXmlTextWriter(string filename, Encoding encoding)
        : base(filename, encoding)
    {
    }

    /// <summary>Creates a writer that writes to the stream <paramref name="w"/>.</summary>
    public FullElementXmlTextWriter(Stream w, Encoding encoding)
        : base(w, encoding)
    {
    }

    /// <summary>Creates a writer that writes to the text writer <paramref name="w"/>.</summary>
    public FullElementXmlTextWriter(TextWriter w)
        : base(w)
    {
    }

    /// <summary>Closes the current element by calling the base full-end-element routine.</summary>
    public override void WriteEndElement() => base.WriteFullEndElement();
}
// Regex that matches the text found between <value> and </value> (non-greedy, at least one character).
string pattern = @"(?<=<value>).+?(?=<\/value>)";