many changes ;-)

Former-commit-id: cb4aea844e960e4af4dc673ebc8eec49a7d12b13
This commit is contained in:
2023-12-05 15:28:29 +01:00
parent 03bef6461d
commit 0f8066d367
39 changed files with 951797 additions and 198 deletions
+198
View File
@@ -0,0 +1,198 @@
CIRA_Arth03 CIRA001_A acacacac:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01A;
CIRA_Arth03 CIRA002_A acagcaca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01B;
CIRA_Arth03 CIRA003_A gtgtacat:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01C;
CIRA_Arth03 CIRA004_A tatgtcag:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01D;
CIRA_Arth03 CIRA005_A tagtcgca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01E;
CIRA_Arth03 CIRA006_A tactatac:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01F;
CIRA_Arth03 CIRA007_A actagatc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01G;
CIRA_Arth03 BLNK001 gatcgcga:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01H;
CIRA_Arth03 CIRA008_A acacacac:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02A;
CIRA_Arth03 CIRA009_A acagcaca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02B;
CIRA_Arth03 CIRA010_A gtgtacat:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02C;
CIRA_Arth03 CPCR01_A tatgtcag:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02D;
CIRA_Arth03 CIRA011_A tagtcgca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02E;
CIRA_Arth03 CIRA012_A tactatac:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02F;
CIRA_Arth03 BLNK002 actagatc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02G;
CIRA_Arth03 CIRA013_A gatcgcga:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02H;
CIRA_Arth03 CIRA014_A acacacac:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03A;
CIRA_Arth03 CIRA015_A acagcaca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03B;
CIRA_Arth03 CIRA016_A gtgtacat:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03C;
CIRA_Arth03 CIRA017_A tatgtcag:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03D;
CIRA_Arth03 CIRA018_A tagtcgca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03E;
CIRA_Arth03 BLNK003 tactatac:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03F;
CIRA_Arth03 CIRA019_A actagatc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03G;
CIRA_Arth03 CIRA020_A gatcgcga:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03H;
CIRA_Arth03 CIRA021_A acacacac:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04A;
CIRA_Arth03 CIRA022_A acagcaca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04B;
CIRA_Arth03 CIRA023_A gtgtacat:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04C;
CIRA_Arth03 CIRA024_A tatgtcag:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04D;
CIRA_Arth03 BLNK004 tagtcgca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04E;
CIRA_Arth03 CIRA025_A tactatac:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04F;
CIRA_Arth03 CIRA026_A actagatc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04G;
CIRA_Arth03 CIRA027_A gatcgcga:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04H;
CIRA_Arth03 CPOS232_A acacacac:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05A;
CIRA_Arth03 CIRA028_A acagcaca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05B;
CIRA_Arth03 CIRA029_A gtgtacat:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05C;
CIRA_Arth03 BLNK005 tatgtcag:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05D;
CIRA_Arth03 CIRA030_A tagtcgca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05E;
CIRA_Arth03 CIRA031_A tactatac:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05F;
CIRA_Arth03 CIRA032_A actagatc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05G;
CIRA_Arth03 CIRA033_A gatcgcga:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05H;
CIRA_Arth03 CIRA034_A acacacac:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06A;
CIRA_Arth03 CIRA035_A acagcaca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06B;
CIRA_Arth03 BLNK006 gtgtacat:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06C;
CIRA_Arth03 CIRA036_A tatgtcag:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06D;
CIRA_Arth03 CIRA037_A tagtcgca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06E;
CIRA_Arth03 CIRA038_A tactatac:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06F;
CIRA_Arth03 CIRA039_A actagatc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06G;
CIRA_Arth03 CIRA040_A gatcgcga:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06H;
CIRA_Arth03 CIRA041_A acacacac:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07A;
CIRA_Arth03 BLNK007 acagcaca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07B;
CIRA_Arth03 CIRA042_A gtgtacat:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07C;
CIRA_Arth03 CIRA043_A tatgtcag:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07D;
CIRA_Arth03 CIRA044_A tagtcgca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07E;
CIRA_Arth03 CIRA045_A tactatac:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07F;
CIRA_Arth03 CIRA046_A actagatc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07G;
CIRA_Arth03 CIRA047_A gatcgcga:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07H;
CIRA_Arth03 BLNK008 acacacac:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08A;
CIRA_Arth03 CIRA048_A acagcaca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08B;
CIRA_Arth03 CIRA049_A gtgtacat:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08C;
CIRA_Arth03 CIRA050_A tatgtcag:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08D;
CIRA_Arth03 CIRA051_A tagtcgca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08E;
CIRA_Arth03 CPCR02_A tactatac:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08F;
CIRA_Arth03 CIRA052_A actagatc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08G;
CIRA_Arth03 CIRA053_A gatcgcga:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08H;
CIRA_Arth03 CIRA054_A acacacac:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_09A;
CIRA_Arth03 CPOS241_A acagcaca:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_09B;
CIRA_Arth03 CIRA001_B cgctctcg:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01A;
CIRA_Arth03 CIRA002_B gtcgtaga:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01B;
CIRA_Arth03 CIRA003_B gtcacgtc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01C;
CIRA_Arth03 CIRA004_B gactgatg:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01D;
CIRA_Arth03 CIRA005_B agactatg:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01E;
CIRA_Arth03 CIRA006_B gcgtcagc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01F;
CIRA_Arth03 CIRA007_B tgacatca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01G;
CIRA_Arth03 BLNK009 acatgtgt:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01H;
CIRA_Arth03 CIRA008_B cgctctcg:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02A;
CIRA_Arth03 CIRA009_B gtcgtaga:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02B;
CIRA_Arth03 CIRA010_B gtcacgtc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02C;
CIRA_Arth03 CPCR01_B gactgatg:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02D;
CIRA_Arth03 CIRA011_B agactatg:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02E;
CIRA_Arth03 CIRA012_B gcgtcagc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02F;
CIRA_Arth03 BLNK010 tgacatca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02G;
CIRA_Arth03 CIRA013_B acatgtgt:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02H;
CIRA_Arth03 CIRA014_B cgctctcg:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03A;
CIRA_Arth03 CIRA015_B gtcgtaga:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03B;
CIRA_Arth03 CIRA016_B gtcacgtc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03C;
CIRA_Arth03 CIRA017_B gactgatg:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03D;
CIRA_Arth03 CIRA018_B agactatg:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03E;
CIRA_Arth03 BLNK011 gcgtcagc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03F;
CIRA_Arth03 CIRA019_B tgacatca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03G;
CIRA_Arth03 CIRA020_B acatgtgt:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03H;
CIRA_Arth03 CIRA021_B cgctctcg:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04A;
CIRA_Arth03 CIRA022_B gtcgtaga:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04B;
CIRA_Arth03 CIRA023_B gtcacgtc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04C;
CIRA_Arth03 CIRA024_B gactgatg:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04D;
CIRA_Arth03 BLNK012 agactatg:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04E;
CIRA_Arth03 CIRA025_B gcgtcagc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04F;
CIRA_Arth03 CIRA026_B tgacatca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04G;
CIRA_Arth03 CIRA027_B acatgtgt:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04H;
CIRA_Arth03 CPOS232_B cgctctcg:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05A;
CIRA_Arth03 CIRA028_B gtcgtaga:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05B;
CIRA_Arth03 CIRA029_B gtcacgtc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05C;
CIRA_Arth03 BLNK013 gactgatg:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05D;
CIRA_Arth03 CIRA030_B agactatg:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05E;
CIRA_Arth03 CIRA031_B gcgtcagc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05F;
CIRA_Arth03 CIRA032_B tgacatca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05G;
CIRA_Arth03 CIRA033_B acatgtgt:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05H;
CIRA_Arth03 CIRA034_B cgctctcg:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06A;
CIRA_Arth03 CIRA035_B gtcgtaga:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06B;
CIRA_Arth03 BLNK014 gtcacgtc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06C;
CIRA_Arth03 CIRA036_B gactgatg:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06D;
CIRA_Arth03 CIRA037_B agactatg:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06E;
CIRA_Arth03 CIRA038_B gcgtcagc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06F;
CIRA_Arth03 CIRA039_B tgacatca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06G;
CIRA_Arth03 CIRA040_B acatgtgt:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06H;
CIRA_Arth03 CIRA041_B cgctctcg:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07A;
CIRA_Arth03 BLNK015 gtcgtaga:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07B;
CIRA_Arth03 CIRA042_B gtcacgtc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07C;
CIRA_Arth03 CIRA043_B gactgatg:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07D;
CIRA_Arth03 CIRA044_B agactatg:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07E;
CIRA_Arth03 CIRA045_B gcgtcagc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07F;
CIRA_Arth03 CIRA046_B tgacatca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07G;
CIRA_Arth03 CIRA047_B acatgtgt:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07H;
CIRA_Arth03 BLNK016 cgctctcg:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08A;
CIRA_Arth03 CIRA048_B gtcgtaga:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08B;
CIRA_Arth03 CIRA049_B gtcacgtc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08C;
CIRA_Arth03 CIRA050_B gactgatg:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08D;
CIRA_Arth03 CIRA051_B agactatg:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08E;
CIRA_Arth03 CPCR02_B gcgtcagc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08F;
CIRA_Arth03 CIRA052_B tgacatca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08G;
CIRA_Arth03 CIRA053_B acatgtgt:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08H;
CIRA_Arth03 CIRA054_B cgctctcg:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_09A;
CIRA_Arth03 CPOS241_B gtcgtaga:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_09B;
CIRA_Arth03 CIRA002_C atgatcgc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01B;
CIRA_Arth03 CIRA003_C acgacgag:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01C;
CIRA_Arth03 CIRA004_C catcagtc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01D;
CIRA_Arth03 CIRA005_C atcagtca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01E;
CIRA_Arth03 CIRA006_C tctactga:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01F;
CIRA_Arth03 CIRA007_C gatgatct:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01G;
CIRA_Arth03 CIRA009_C atgatcgc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02B;
CIRA_Arth03 CIRA010_C acgacgag:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02C;
CIRA_Arth03 CPCR01_C catcagtc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02D;
CIRA_Arth03 CIRA011_C atcagtca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02E;
CIRA_Arth03 CIRA012_C tctactga:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02F;
CIRA_Arth03 BLNK018 gatgatct:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02G;
CIRA_Arth03 CIRA015_C atgatcgc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03B;
CIRA_Arth03 CIRA016_C acgacgag:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03C;
CIRA_Arth03 CIRA017_C catcagtc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03D;
CIRA_Arth03 CIRA018_C atcagtca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03E;
CIRA_Arth03 BLNK019 tctactga:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03F;
CIRA_Arth03 CIRA019_C gatgatct:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03G;
CIRA_Arth03 CIRA022_C atgatcgc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04B;
CIRA_Arth03 CIRA023_C acgacgag:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04C;
CIRA_Arth03 CIRA024_C catcagtc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04D;
CIRA_Arth03 BLNK020 atcagtca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04E;
CIRA_Arth03 CIRA025_C tctactga:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04F;
CIRA_Arth03 CIRA026_C gatgatct:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04G;
CIRA_Arth03 CIRA028_C atgatcgc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05B;
CIRA_Arth03 CIRA029_C acgacgag:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05C;
CIRA_Arth03 BLNK021 catcagtc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05D;
CIRA_Arth03 CIRA030_C atcagtca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05E;
CIRA_Arth03 CIRA031_C tctactga:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05F;
CIRA_Arth03 CIRA032_C gatgatct:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05G;
CIRA_Arth03 CIRA035_C atgatcgc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06B;
CIRA_Arth03 BLNK022 acgacgag:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06C;
CIRA_Arth03 CIRA036_C catcagtc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06D;
CIRA_Arth03 CIRA037_C atcagtca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06E;
CIRA_Arth03 CIRA038_C tctactga:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06F;
CIRA_Arth03 CIRA039_C gatgatct:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06G;
CIRA_Arth03 BLNK023 atgatcgc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07B;
CIRA_Arth03 CIRA042_C acgacgag:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07C;
CIRA_Arth03 CIRA043_C catcagtc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07D;
CIRA_Arth03 CIRA044_C atcagtca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07E;
CIRA_Arth03 CIRA045_C tctactga:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07F;
CIRA_Arth03 CIRA046_C gatgatct:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07G;
CIRA_Arth03 CIRA048_C atgatcgc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08B;
CIRA_Arth03 CIRA049_C acgacgag:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08C;
CIRA_Arth03 CIRA050_C catcagtc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08D;
CIRA_Arth03 CIRA051_C atcagtca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08E;
CIRA_Arth03 CPCR02_C tctactga:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08F;
CIRA_Arth03 CIRA052_C gatgatct:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08G;
CIRA_Arth03 CPOS241_C atgatcgc:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_09B;
CIRA_Arth03 CIRA001_C acacacac:agactatg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_01A;
CIRA_Arth03 BLNK017 gatcgcga:agactatg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_01H;
CIRA_Arth03 CIRA008_C acacacac:gcgtcagc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_02A;
CIRA_Arth03 CIRA013_C gatcgcga:gcgtcagc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_02H;
CIRA_Arth03 CIRA014_C acacacac:tgacatca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_03A;
CIRA_Arth03 CIRA020_C gatcgcga:tgacatca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_03H;
CIRA_Arth03 CIRA021_C acacacac:acatgtgt GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_04A;
CIRA_Arth03 CIRA027_C gatcgcga:acatgtgt GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_04H;
CIRA_Arth03 CPOS232_C acacacac:gtacgact GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_05A;
CIRA_Arth03 CIRA033_C gatcgcga:gtacgact GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_05H;
CIRA_Arth03 CIRA034_C acacacac:atgatcgc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_06A;
CIRA_Arth03 CIRA040_C gatcgcga:atgatcgc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_06H;
CIRA_Arth03 CIRA041_C acacacac:acgacgag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_07A;
CIRA_Arth03 CIRA047_C gatcgcga:acgacgag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_07H;
CIRA_Arth03 BLNK024 acacacac:catcagtc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_08A;
CIRA_Arth03 CIRA053_C gatcgcga:catcagtc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_08H;
CIRA_Arth03 CIRA054_C acacacac:atcagtca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_09A;
+409337
View File
File diff suppressed because it is too large Load Diff
+1
View File
@@ -0,0 +1 @@
412536492810152d7835808871ea2b0289a770f7
BIN
View File
Binary file not shown.
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
+6
View File
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
+118
View File
@@ -0,0 +1,118 @@
#!/bin/bash
#OAR -n gbsort
##OAR --array 50
##OAR --array-param-file 50_first.tsv
#OAR --project phyloalps
#OAR -l nodes=1/core=10,walltime=24:00:00
#OAR -O gbsort.%jobid%.log
#OAR -E gbsort.%jobid%.log
#
# The "#OAR" lines above are batch-scheduler directives and must stay
# byte-identical; the "##OAR" lines are disabled directives.
#
# Purpose: annotate every Genbank flat file (*.seq.gz) of one release with
# NCBI taxonomy information and distribute the sequences into fasta files
# organised by "division" and "section" (see the processing loop below).
#
# Usage: <this script> GBDIR
#   GBDIR  directory containing the "Release-<number>" sub-directories
#
# Locations noted by the original author
# (NOTE(review): presumably cluster storage paths - confirm):
# /silenus/PROJECTS/pr-phyloalps/coissac
# /bettik/LECA/ENVIRONMENT/data/biodatabase/genbank
#
# Used resources URLs
#
NCBIURL="https://ftp.ncbi.nlm.nih.gov/" # NCBI Web site URL
GBURL="${NCBIURL}genbank/" # Directory of Genbank flat files
TAXOURL="${NCBIURL}pub/taxonomy/taxdump.tar.gz" # NCBI Taxdump
LOGFILE="download.log" # not referenced in the visible part of this script
#
# List of downloaded Genbank divisions
# (DIV and GBURL are not referenced in the visible part of this script)
#
DIV="bct|inv|mam|phg|pln|pri|rod|vrl|vrt"
############################
#
# Functions
#
############################
# pattern_at_rank TAXODIR RANK
#
# Prints an anchored alternation regular expression, "^(id1|id2|...)$",
# built from the first column of TAXODIR/nodes.dmp for every line whose
# third "|"-separated column matches RANK surrounded by tabulations.
pattern_at_rank() {
    local taxdir="$1"
    local wanted="$2"
    local ids

    # ORS="|" joins the selected identifiers; the trailing "|" left after
    # the last one is removed by sed.
    ids=$(awk -F "|" -v rank="$wanted" '
        BEGIN {
            ORS = "|"
            rank = "\t" rank "\t"
        }
        $3 ~ rank {
            gsub(/^[ \t]+|[ \t]+$/, "", $1)
            print $1
        }' "${taxdir}/nodes.dmp" | sed 's/|$//')

    printf '^(%s)$\n' "$ids"
}
# Directory holding the local Genbank mirror ("Release-<number>" sub-directories)
GBDIR=$1

if [[ -z "${GBDIR}" ]]; then
    echo "Usage: $0 <genbank_directory>" >&2
    exit 1
fi

#
# Look up, in the local Genbank directory, the most recent release number
#

echo "Looking at current Genbank release number"

# A glob is used instead of parsing `ls`, and the numbers are sorted
# numerically so that e.g. release 99 does not sort after release 251.
GB_Latest_Release=$(for r in "${GBDIR}/Release-"*; do
        [[ -e "$r" ]] || continue
        basename "$r"
    done \
    | sed 's/^Release-//' \
    | sort -rn \
    | head -1)

echo "identified latest release number is : ${GB_Latest_Release}"

#
# The processed release is pinned (251 unless GB_RELEASE is set in the
# environment), independently of the latest release detected above.
# NOTE(review): confirm whether the pin is still wanted.
#
GB_Release_Number="${GB_RELEASE:-251}"

echo "processed release number is : ${GB_Release_Number}"

GBSOURCE="${GBDIR}/Release-${GB_Release_Number}"

#
# Create the working directory of the release and move into it
#

mkdir -p "Release-${GB_Release_Number}"
cd "Release-${GB_Release_Number}" || exit

#
# Download the current NCBI taxonomy
#

mkdir -p "ncbitaxo"

if [[ ! -f ncbitaxo/nodes.dmp ]] || [[ ! -f ncbitaxo/names.dmp ]] ; then
    # -f makes curl fail on HTTP errors instead of piping an error page to tar
    if ! curl -f "${TAXOURL}" | tar -C "ncbitaxo" -zxf - ; then
        echo "Cannot download the NCBI taxonomy from ${TAXOURL}" >&2
        exit 1
    fi
fi

#
# Annotate each Genbank flat file with its taxonomy, keep the sequences
# annotated at least at the genus and family levels and at the phylum or
# order level, then distribute them into fasta files according to their
# division (superkingdom, overridden by kingdom) and section (phylum,
# overridden by order).
#

for f in "${GBSOURCE}/"*.seq.gz ; do
    # Skip the unexpanded pattern when the release directory has no flat file
    [[ -e "$f" ]] || continue

    echo "PROCESSING : $f saved into $(pwd)"

    obiannotate --genbank -t ncbitaxo \
        --with-taxon-at-rank kingdom \
        --with-taxon-at-rank superkingdom \
        --with-taxon-at-rank phylum \
        --with-taxon-at-rank order \
        --with-taxon-at-rank family \
        --with-taxon-at-rank genus \
        -S division='"misc-@-0"' \
        -S section='"misc-@-0"' \
        "$f" \
    | obigrep -A genus_taxid -A family_taxid \
    | obigrep -p 'annotations.genus_taxid > 0 && annotations.family_taxid > 0' \
        -p 'annotations.phylum_taxid > 0 || annotations.order_taxid > 0' \
    | obiannotate -p 'annotations.superkingdom_taxid > 0' \
        -S division='printf("%s-S-%d",subspc(annotations.superkingdom_name),annotations.superkingdom_taxid)' \
    | obiannotate -p 'annotations.kingdom_taxid > 0' \
        -S division='printf("%s-K-%d",subspc(annotations.kingdom_name),annotations.kingdom_taxid)' \
    | obiannotate -p 'annotations.phylum_taxid > 0' \
        -S section='printf("%s-P-%d",subspc(annotations.phylum_name),annotations.phylum_taxid)' \
    | obiannotate -p 'annotations.order_taxid > 0' \
        -S section='printf("%s-O-%d",subspc(annotations.order_name),annotations.order_taxid)' \
    | obidistribute -Z -A -p "%s.fasta" -c section -d division
done
+31
View File
@@ -0,0 +1,31 @@
# Build stage: installs the compilation toolchain, clones org-asm and installs
# it into a dedicated Python virtual environment located in /org-assembler.
# NOTE(review): ubuntu:lunar is a non-LTS release - confirm the tag is still
# available before rebuilding.
FROM ubuntu:lunar AS builder
LABEL dockerfile.version="1"
LABEL software="obitools4"
WORKDIR /
RUN apt update --fix-missing && apt upgrade -y
RUN apt install -y build-essential
RUN apt install -y git tcsh bash gawk parallel gettext zlib1g-dev libglib2.0-0
# RUN git clone https://git.metabarcoding.org/org-asm/org-annotate.git
# RUN cd org-annotate/src && make && cd ../..
# RUN cd /org-annotate/data/its/ITSx_db/HMMs && \
# rm *.h3* && \
# for f in *.hmm ; do /org-annotate/ports/i386-linux/bin/hmmpress $f ; done
RUN apt install -y python3-pip python3-dev python3-venv
RUN git clone https://git.metabarcoding.org/org-asm/org-asm.git /org-asm
# Every RUN instruction starts a new shell in the current WORKDIR, so a
# standalone `RUN cd org-asm` has no effect on the following instructions.
# WORKDIR is used instead so that setup.py is run from the cloned repository.
WORKDIR /org-asm
RUN python3 -m venv /org-assembler
RUN /org-assembler/bin/pip3 install -r /org-asm/requirements.txt
RUN /org-assembler/bin/python3 setup.py install --no-serenity
WORKDIR /
# FROM ubuntu:lunar as phyloskims
# WORKDIR /
# RUN apt update --fix-missing && apt upgrade -y
# RUN apt install -y tcsh bash gawk parallel zlib1g libglib2.0-0
# COPY --from=builder /org-annotate /org-annotate
# RUN rm -rf /org-annotate/src
# COPY --from=builder /org-assembler /org-assembler
# RUN mkdir -p /data
@@ -0,0 +1,2 @@
package obiformats
+314 -198
View File
@@ -3,93 +3,190 @@ package obistats
import (
"math"
"sync"
"time"
"golang.org/x/exp/rand"
"gonum.org/v1/gonum/stat/sampleuv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// AssignToClass applies the nearest neighbor algorithm to assign data points to classes.
//
// Parameters:
// - data: a 2D slice of float64 representing the data points to be assigned.
// - centers: a 2D slice of float64 representing the center points for each class.
//
// Return:
// - classes: a slice of int representing the assigned class for each data point.
func AssignToClass(data, centers *obiutils.Matrix[float64]) []int {
classes := make([]int, len(*data))
numData := len(*data)
numCenters := len(*centers)
var wg sync.WaitGroup
wg.Add(numData)
for i := 0; i < numData; i++ {
go func(i int) {
defer wg.Done()
minDist := math.MaxFloat64
minDistIndex := -1
rowData := (*data)[i]
for j := 0; j < numCenters; j++ {
centerData := (*centers)[j]
dist := 0.0
for d, val := range rowData {
diff := val - centerData[d]
dist += diff * diff
}
if dist < minDist {
minDist = dist
minDistIndex = j
}
}
classes[i] = minDistIndex
}(i)
func squareDist(a, b []float64) float64 {
sum := 0.0
for i := 0; i < len(a); i++ {
diff := a[i] - b[i]
sum += diff * diff
}
wg.Wait()
return classes
return sum
}
// ComputeCenters calculates the centers of clusters for a given data set.
//
// Parameters:
// - data: a pointer to a matrix of float64 values representing the data set.
// - k: an integer representing the number of clusters.
// - classes: a slice of integers representing the assigned cluster for each data point.
//
// Returns:
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
// ComputeCenters calculates the centers of clusters for a given data set.
//
// Parameters:
// - data: a pointer to a matrix of float64 values representing the data set.
// - k: an integer representing the number of clusters.
// - classes: a slice of integers representing the assigned cluster for each data point.
//
// Returns:
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiutils.Matrix[float64] {
centers := obiutils.Make2DNumericArray[float64](k, len((*data)[0]), true)
ns := make([]int, k)
// DefaultRG builds a new random generator whose source is seeded with the
// current time expressed in nanoseconds.
func DefaultRG() *rand.Rand {
	seed := uint64(time.Now().UnixNano())
	source := rand.NewSource(seed)
	return rand.New(source)
}
var wg sync.WaitGroup
// KmeansClustering stores the state of a K-means clustering of a dataset.
type KmeansClustering struct {
	// data points to the clustered dataset, one row per data point.
	data *obiutils.Matrix[float64]
	// rg is the random generator passed to the weighted sampler used to pick centers.
	rg *rand.Rand
	// centers holds the coordinates of each center, one row per cluster.
	centers obiutils.Matrix[float64]
	// icenters holds, for each cluster, the index in data of the row selected as its center.
	icenters []int
	// sizes holds the number of data points assigned to each cluster.
	sizes []int
	// distmin holds, for each data point, the squared distance to its closest center.
	distmin []float64
	// classes holds, for each data point, the index of the cluster it is assigned to.
	classes []int
}
for i := range ns {
ns[i] = 0
// MakeKmeansClustering creates a KmeansClustering over data and adds k
// centers to it by calling AddACenter k times.
//
// Parameters:
//   - data: a pointer to the matrix of data points, one row per point.
//   - k: the number of centers to add.
//   - rg: the random generator stored in the clustering.
//
// Returns a pointer to the initialized KmeansClustering.
func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering {
	n := len(*data)

	// Before any center exists, every point is at the largest possible
	// distance from its closest center.
	nearest := make([]float64, n)
	for idx := range nearest {
		nearest[idx] = math.MaxFloat64
	}

	km := new(KmeansClustering)
	km.data = data
	km.rg = rg
	km.icenters = make([]int, 0, k)
	km.sizes = make([]int, 0, k)
	km.centers = make(obiutils.Matrix[float64], 0, k)
	km.distmin = nearest
	km.classes = make([]int, n)

	for added := 0; added < k; added++ {
		km.AddACenter()
	}

	return km
}
// K reports how many centers (clusters) are currently defined in the
// clustering, i.e. the length of the list of center indices.
func (clustering *KmeansClustering) K() int {
	nCenters := len(clustering.icenters)
	return nCenters
}
// N reports the number of data points (rows of the data matrix) handled
// by the clustering.
func (clustering *KmeansClustering) N() int {
	rows := *clustering.data
	return len(rows)
}
// Dimension returns the dimension of the KmeansClustering data, measured as
// the length of the first row of the data matrix.
//
// It returns 0 when the dataset contains no data point, instead of
// panicking on the access to the missing first row.
func (clustering *KmeansClustering) Dimension() int {
	if clustering.N() == 0 {
		return 0
	}
	return len((*clustering.data)[0])
}
// AddACenter selects one more data point as a center and appends it to the
// clustering.
//
// The first center is drawn uniformly among the data points. Each following
// center is drawn with a probability proportional to the squared distance
// separating the point from its closest center (distmin). When the weighted
// draw cannot return a point (Take reports no item remaining), the center is
// drawn uniformly instead.
//
// After the selection, distmin is updated for every data point. The function
// panics if the dataset is empty, since no point can be drawn.
func (clustering *KmeansClustering) AddACenter() {
	C := -1

	if clustering.K() > 0 {
		w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
		// Take returns ok == false with an invalid index when no point
		// can be drawn; it must not be used to index the data.
		if idx, ok := w.Take(); ok {
			C = idx
		}
	}

	if C < 0 {
		// Use the generator of the clustering so that a seeded generator
		// gives reproducible results; fall back to the package-level
		// generator only when none was provided.
		if clustering.rg != nil {
			C = clustering.rg.Intn(clustering.N())
		} else {
			C = rand.Intn(clustering.N())
		}
	}

	clustering.icenters = append(clustering.icenters, C)
	clustering.sizes = append(clustering.sizes, 0)

	// NOTE(review): the center shares its backing array with the data row;
	// confirm that no caller modifies centers in place.
	center := (*clustering.data)[C]
	clustering.centers = append(clustering.centers, center)

	// Update the squared distance of each point to its closest center.
	n := clustering.N()
	for i := 0; i < n; i++ {
		d := squareDist((*clustering.data)[i], center)
		if d < clustering.distmin[i] {
			clustering.distmin[i] = d
		}
	}
}
// ResetEmptyCenters replaces the center of every empty cluster.
//
// It iterates over the clusters and, for each one whose size is zero, draws
// a data point with a probability proportional to distmin and installs it
// as the new center (icenters and centers are updated). A single weighted
// sampler is shared by all the draws of one call: a drawn point has its
// weight set to zero, so two empty clusters cannot receive the same point.
//
// When no point can be drawn anymore, the remaining empty clusters are left
// untouched. The function returns the number of centers actually reset.
func (clustering *KmeansClustering) ResetEmptyCenters() int {
	nreset := 0

	var w sampleuv.Weighted
	ready := false

	for i := 0; i < clustering.K(); i++ {
		if clustering.sizes[i] != 0 {
			continue
		}

		// The sampler is built only when at least one cluster is empty.
		if !ready {
			w = sampleuv.NewWeighted(clustering.distmin, clustering.rg)
			ready = true
		}

		C, ok := w.Take()
		if !ok {
			// No point left to draw: the returned index is invalid and
			// the following draws would fail as well.
			break
		}

		clustering.icenters[i] = C
		clustering.centers[i] = (*clustering.data)[C]
		nreset++
	}

	return nreset
}
// AssignToClass assigns each data point to the class of its nearest center.
//
// The cluster sizes and the minimal distances are first reset, then one
// goroutine per data point looks for the closest center (squared Euclidean
// distance) and records the class, the cluster size and the distance.
//
// Once every point is assigned, the empty clusters are given a new center
// by ResetEmptyCenters; if at least one center was reset, the whole
// assignment is run again.
func (clustering *KmeansClustering) AssignToClass() {
	var wg sync.WaitGroup
	var lock sync.Mutex

	for i := 0; i < clustering.K(); i++ {
		clustering.sizes[i] = 0
	}

	for i := 0; i < clustering.N(); i++ {
		clustering.distmin[i] = math.MaxFloat64
	}

	goroutine := func(i int) {
		defer wg.Done()
		dmin := math.MaxFloat64
		cmin := -1
		for j, center := range clustering.centers {
			dist := squareDist((*clustering.data)[i], center)
			if dist < dmin {
				dmin = dist
				cmin = j
			}
		}

		// sizes is shared by all the goroutines, hence the lock.
		lock.Lock()
		clustering.classes[i] = cmin
		clustering.sizes[cmin]++
		clustering.distmin[i] = dmin
		lock.Unlock()
	}

	wg.Add(clustering.N())
	for i := 0; i < clustering.N(); i++ {
		go goroutine(i)
	}

	// The sizes and the minimal distances must be complete before looking
	// for empty clusters: without this wait, ResetEmptyCenters would read
	// them while the goroutines are still writing.
	wg.Wait()

	nreset := clustering.ResetEmptyCenters()

	if nreset > 0 {
		log.Warnf("Reset %d empty centers", nreset)
		clustering.AssignToClass()
	}
}
// ComputeCenters calculates the centers of the K-means clustering algorithm.
//
// It takes no parameters.
// It does not return any values.
func (clustering *KmeansClustering) ComputeCenters() {
var wg sync.WaitGroup
centers := clustering.centers
data := clustering.data
classes := clustering.classes
k := clustering.K()
// Goroutine code
goroutine := func(centerIdx int) {
goroutine1 := func(centerIdx int) {
defer wg.Done()
for j, row := range *data {
class := classes[j]
if class == centerIdx {
ns[centerIdx]++
for l, val := range row {
centers[centerIdx][l] += val
}
@@ -99,149 +196,168 @@ func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiut
for i := 0; i < k; i++ {
wg.Add(1)
go goroutine(i)
go goroutine1(i)
}
wg.Wait()
for i := range centers {
for j := range centers[i] {
centers[i][j] /= float64(ns[i])
centers[i][j] /= float64(clustering.sizes[i])
}
}
return &centers
}
// ComputeInertia computes the inertia of the given data and centers in parallel.
//
// Parameters:
// - data: A pointer to a Matrix of float64 representing the data.
// - classes: A slice of int representing the class labels for each data point.
// - centers: A pointer to a Matrix of float64 representing the centers.
//
// Return type:
// - float64: The computed inertia.
func ComputeInertia(data *obiutils.Matrix[float64], classes []int, centers *obiutils.Matrix[float64]) float64 {
inertia := make(chan float64)
numRows := len(*data)
wg := sync.WaitGroup{}
wg.Add(numRows)
for i := 0; i < numRows; i++ {
go func(i int) {
defer wg.Done()
row := (*data)[i]
class := classes[i]
center := (*centers)[class]
inertiaLocal := 0.0
for j, val := range row {
diff := val - center[j]
inertiaLocal += diff * diff
goroutine2 := func(centerIdx int) {
defer wg.Done()
dkmin := math.MaxFloat64
dki := -1
center := centers[centerIdx]
for j, row := range *data {
if classes[j] == centerIdx {
dist := squareDist(row, center)
if dist < dkmin {
dkmin = dist
dki = j
}
}
inertia <- inertiaLocal
}(i)
}
go func() {
wg.Wait()
close(inertia)
}()
totalInertia := 0.0
for localInertia := range inertia {
totalInertia += localInertia
}
return totalInertia
}
// Kmeans performs the K-means clustering algorithm on the given data.
//
// If centers and *centers are both non-nil, centers is considered initialized
// and the number of classes (k) is set to the number of rows in centers.
// Otherwise, the number of classes is defined by the value of k.
//
// Parameters:
// - data: A pointer to a Matrix[float64] that represents the input data.
// - k: An integer that specifies the number of clusters to create.
// - threshold: A float64 value that determines the convergence threshold.
// - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
//
// Returns:
// - classes: A slice of integers that assigns each data point to a cluster.
// - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
// - inertia: A float64 value that represents the overall inertia of the clustering.
// - converged: A boolean value indicating whether the algorithm converged.
func Kmeans(data *obiutils.Matrix[float64],
k int,
threshold float64,
centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
if centers == nil || *centers == nil {
*centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
center_ids := SampleIntWithoutReplacement(k, len(*data))
for i, id := range center_ids {
(*centers)[i] = (*data)[id]
}
} else {
k = len(*centers)
clustering.icenters[centerIdx] = dki
clustering.centers[centerIdx] = (*data)[dki]
}
classes := AssignToClass(data, centers)
centers = ComputeCenters(data, k, classes)
inertia := ComputeInertia(data, classes, centers)
delta := threshold * 100.0
for i := 0; i < 100 && delta > threshold; i++ {
classes = AssignToClass(data, centers)
centers = ComputeCenters(data, k, classes)
newi := ComputeInertia(data, classes, centers)
delta = inertia - newi
inertia = newi
log.Debugf("Inertia: %f, delta: %f", inertia, delta)
}
return classes, centers, inertia, delta < threshold
}
// KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
//
// It takes a matrix of data points and a matrix of centers as input.
// The best representative is the data point that is closest to the center of the cluster.
// Returns an array of integers containing the index of the best representative for each cluster.
func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
bestRepresentative := make([]int, len(*centers))
var wg sync.WaitGroup
wg.Add(len(*centers))
for j, center := range *centers {
go func(j int, center []float64) {
defer wg.Done()
bestDistToCenter := math.MaxFloat64
best := -1
for i, row := range *data {
dist := 0.0
for d, val := range row {
diff := val - center[d]
dist += diff * diff
}
if dist < bestDistToCenter {
bestDistToCenter = dist
best = i
}
}
if best == -1 {
log.Fatalf("No representative found for cluster %d", j)
}
bestRepresentative[j] = best
}(j, center)
for i := 0; i < k; i++ {
wg.Add(1)
go goroutine2(i)
}
wg.Wait()
return bestRepresentative
}
// Inertia returns the sum of the distmin entries of the N() data points,
// i.e. the per-point minimal distances last recorded in the clustering.
func (clustering *KmeansClustering) Inertia() float64 {
	var total float64

	for point := 0; point < clustering.N(); point++ {
		total += clustering.distmin[point]
	}

	return total
}
// Centers returns the matrix of the current cluster centers.
//
// The centers field itself is returned; this method makes no copy.
func (clustering *KmeansClustering) Centers() obiutils.Matrix[float64] {
	return clustering.centers
}
// CentersIndices returns the icenters slice of the clustering.
//
// NOTE(review): presumably icenters[k] is the index, in the data matrix, of
// the point used as center of class k — confirm against ComputeCenters.
// The slice is returned as is, without copy.
func (clustering *KmeansClustering) CentersIndices() []int {
	return clustering.icenters
}
// Sizes returns the number of data points counted in each class.
//
// The sizes slice is returned as is, without copy.
func (clustering *KmeansClustering) Sizes() []int {
	return clustering.sizes
}
// Classes returns, for each data point, the index of the class it is
// assigned to.
//
// The classes slice is returned as is, without copy.
func (clustering *KmeansClustering) Classes() []int {
	return clustering.classes
}
// Run iterates the assignment and center computation steps until the inertia
// stops decreasing by more than threshold, or until max_cycle iterations have
// been done.
//
// Parameters:
//   - max_cycle: the maximum number of iterations.
//   - threshold: the minimal decrease of inertia required to keep iterating.
//
// Returns true when the last observed decrease of inertia is lower than or
// equal to threshold, false otherwise.
func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool {
	previous, current := math.MaxFloat64, clustering.Inertia()

	for cycle := 0; cycle < max_cycle; cycle++ {
		// Written as a negated ">" on purpose: it keeps the exact behaviour
		// of the loop condition when the difference is NaN.
		if !(previous-current > threshold) {
			break
		}

		previous = current
		clustering.AssignToClass()
		clustering.ComputeCenters()
		current = clustering.Inertia()
	}

	return previous-current <= threshold
}
// // Kmeans performs the K-means clustering algorithm on the given data.
// // If centers and *centers are both non-nil, centers is considered initialized
// // and the number of classes (k) is set to the number of rows in centers.
// // Otherwise, the number of classes is defined by the value of k.
// // Parameters:
// // - data: A pointer to a Matrix[float64] that represents the input data.
// // - k: An integer that specifies the number of clusters to create.
// // - threshold: A float64 value that determines the convergence threshold.
// // - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
// // Returns:
// // - classes: A slice of integers that assigns each data point to a cluster.
// // - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
// // - inertia: A float64 value that represents the overall inertia of the clustering.
// // - converged: A boolean value indicating whether the algorithm converged.
// func Kmeans(data *obiutils.Matrix[float64],
// k int,
// threshold float64,
// centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
// if centers == nil || *centers == nil {
// *centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
// center_ids := SampleIntWithoutReplacement(k, len(*data))
// for i, id := range center_ids {
// (*centers)[i] = (*data)[id]
// }
// } else {
// k = len(*centers)
// }
// classes := AssignToClass(data, centers)
// centers = ComputeCenters(data, k, classes)
// inertia := ComputeInertia(data, classes, centers)
// delta := threshold * 100.0
// for i := 0; i < 100 && delta > threshold; i++ {
// classes = AssignToClass(data, centers)
// centers = ComputeCenters(data, k, classes)
// newi := ComputeInertia(data, classes, centers)
// delta = inertia - newi
// inertia = newi
// log.Debugf("Inertia: %f, delta: %f", inertia, delta)
// }
// return classes, centers, inertia, delta < threshold
// }
// // KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
// //
// // It takes a matrix of data points and a matrix of centers as input.
// // The best representative is the data point that is closest to the center of the cluster.
// // Returns an array of integers containing the index of the best representative for each cluster.
// func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
// bestRepresentative := make([]int, len(*centers))
// var wg sync.WaitGroup
// wg.Add(len(*centers))
// for j, center := range *centers {
// go func(j int, center []float64) {
// defer wg.Done()
// bestDistToCenter := math.MaxFloat64
// best := -1
// for i, row := range *data {
// dist := 0.0
// for d, val := range row {
// diff := val - center[d]
// dist += diff * diff
// }
// if dist < bestDistToCenter {
// bestDistToCenter = dist
// best = i
// }
// }
// if best == -1 {
// log.Fatalf("No representative found for cluster %d", j)
// }
// bestRepresentative[j] = best
// }(j, center)
// }
// wg.Wait()
// return bestRepresentative
// }
+64
View File
@@ -0,0 +1,64 @@
// Package obitable provides a row-oriented data table structure.
package obitable
import (
"reflect"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/chen3feng/stl4go"
)
// Header is declared from stl4go.Ordered.
//
// NOTE(review): stl4go.Ordered looks like a type constraint; how Header is
// meant to be used is not visible here — confirm the intent.
type Header stl4go.Ordered

// Row is a single table row: a map from a name to an untyped value.
type Row map[string]interface{}

// Table is a row-oriented data table.
type Table struct {
	// ColType associates a name with a reflect.Type
	// (presumably the column name and the type of its values — TODO confirm).
	ColType map[string]reflect.Type
	// Rows holds the rows of the table.
	Rows []Row
}
// RowGetter returns the value associated with a name in a row.
type RowGetter func(name string) interface{}

// RowFromMap builds a RowGetter reading its values from a map.
//
// Parameters:
//   - data: the map the values are looked up in.
//   - navalue: the value returned for names absent from data.
//
// The returned getter yields data[name] when the key exists, navalue otherwise.
func RowFromMap(data map[string]interface{}, navalue string) RowGetter {
	return func(name string) interface{} {
		if found, present := data[name]; present {
			return found
		}

		return navalue
	}
}
// RowFromBioSeq builds a RowGetter reading its values from a BioSequence.
//
// The names "id", "sequence", "definition", "taxid" and "count" are mapped to
// the corresponding methods of the sequence. Any other name is looked up with
// GetAttribute when the sequence has annotations.
//
// Parameters:
//   - data: the sequence the values are read from.
//   - navalue: the value returned when the name cannot be resolved.
func RowFromBioSeq(data *obiseq.BioSequence, navalue string) RowGetter {
	return func(name string) interface{} {
		switch name {
		case "id":
			return data.Id()
		case "sequence":
			return data.Sequence()
		case "definition":
			return data.Definition()
		case "taxid":
			return data.Taxid()
		case "count":
			return data.Count()
		}

		// Not a built-in field: fall back on the annotations, if any.
		if data.HasAnnotation() {
			if attribute, found := data.GetAttribute(name); found {
				return attribute
			}
		}

		return navalue
	}
}
+3
View File
@@ -124,6 +124,9 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
}
// classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters)
classifier := obistats.MakeKmeansClustering(&seqworld, n_landmark, obistats.DefaultRG())
_, centers, inertia, converged := classifier.Run(1000, 0.001)
intertia := classifier.Inertia()
_, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters)
dist_centers := 0.0
BIN
View File
Binary file not shown.
+11
View File
@@ -0,0 +1,11 @@
==> db_v05_idx_ori.fasta <==
>AY189646 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"9526@Catarrhini@parvorder","12":"1437010@Boreoeutheria@clade","16":"9347@Eutheria@clade","17":"40674@Mammalia@class","2":"207598@Homininae@subfamily","22":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","29":"33213@Bilateria@clade","3":"9604@Hominidae@family","30":"6072@Eumetazoa@clade","8":"314295@Hominoidea@superfamily"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan119 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
ttagccctaaacctcaacagttaaatcaacaaaactgctcgccagaacactacgrgccac
agcttaaaactcaaaggacctggcggtgcttcatatccct
>AY189647 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","11":"314295@Hominoidea@superfamily","12":"9526@Catarrhini@parvorder","15":"1437010@Boreoeutheria@clade","18":"9347@Eutheria@clade","19":"40674@Mammalia@class","21":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","30":"33213@Bilateria@clade","31":"6072@Eumetazoa@clade","5":"207598@Homininae@subfamily","6":"9604@Hominidae@family"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan120 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
ttagccctaaacctcaacagttaaatcaacaaaaacwsctcgccagaacactacgagtca
cagcttaaaactcaaaggacctggcggtgcttcatatccct
>AC008434 {"count":94,"obitag_ref_index":{"0":"207598@Homininae@subfamily","12":"1437010@Boreoeutheria@clade","16":"40674@Mammalia@class","2":"9604@Hominidae@family","21":"117571@Euteleostomi@clade","23":"7776@Gnathostomata@clade","28":"33213@Bilateria@clade","29":"6072@Eumetazoa@clade","7":"314295@Hominoidea@superfamily","9":"9526@Catarrhini@parvorder"},"species_name":"###","taxid":207598} Homo sapiens chromosome 5 clone CTC-325J23, complete sequence.
ttagccctaaacttcaacagttaaattaacaaaactgctcgccagaacactacgagccac
agcttaaaactcaaaggacctggcggtgcttcatatccct
>AC008512 {"count":5,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"1437010@Boreoeutheria@clade","14":"40674@Mammalia@class","18":"117571@Euteleostomi@clade","2":"207598@Homininae@subfamily","22":"7776@Gnathostomata@clade","26":"33213@Bilateria@clade","28":"6072@Eumetazoa@clade","7":"9526@Catarrhini@parvorder"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens chromosome 5 clone CTC-454K8, complete sequence.
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+11
View File
@@ -0,0 +1,11 @@
==> db_v05_idx_ori.fasta <==
>AY189646 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"9526@Catarrhini@parvorder","12":"1437010@Boreoeutheria@clade","16":"9347@Eutheria@clade","17":"40674@Mammalia@class","2":"207598@Homininae@subfamily","22":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","29":"33213@Bilateria@clade","3":"9604@Hominidae@family","30":"6072@Eumetazoa@clade","8":"314295@Hominoidea@superfamily"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan119 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
ttagccctaaacctcaacagttaaatcaacaaaactgctcgccagaacactacgrgccac
agcttaaaactcaaaggacctggcggtgcttcatatccct
>AY189647 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","11":"314295@Hominoidea@superfamily","12":"9526@Catarrhini@parvorder","15":"1437010@Boreoeutheria@clade","18":"9347@Eutheria@clade","19":"40674@Mammalia@class","21":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","30":"33213@Bilateria@clade","31":"6072@Eumetazoa@clade","5":"207598@Homininae@subfamily","6":"9604@Hominidae@family"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan120 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
ttagccctaaacctcaacagttaaatcaacaaaaacwsctcgccagaacactacgagtca
cagcttaaaactcaaaggacctggcggtgcttcatatccct
>AC008434 {"count":94,"obitag_ref_index":{"0":"207598@Homininae@subfamily","12":"1437010@Boreoeutheria@clade","16":"40674@Mammalia@class","2":"9604@Hominidae@family","21":"117571@Euteleostomi@clade","23":"7776@Gnathostomata@clade","28":"33213@Bilateria@clade","29":"6072@Eumetazoa@clade","7":"314295@Hominoidea@superfamily","9":"9526@Catarrhini@parvorder"},"species_name":"###","taxid":207598} Homo sapiens chromosome 5 clone CTC-325J23, complete sequence.
ttagccctaaacttcaacagttaaattaacaaaactgctcgccagaacactacgagccac
agcttaaaactcaaaggacctggcggtgcttcatatccct
>AC008512 {"count":5,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"1437010@Boreoeutheria@clade","14":"40674@Mammalia@class","18":"117571@Euteleostomi@clade","2":"207598@Homininae@subfamily","22":"7776@Gnathostomata@clade","26":"33213@Bilateria@clade","28":"6072@Eumetazoa@clade","7":"9526@Catarrhini@parvorder"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens chromosome 5 clone CTC-454K8, complete sequence.
Binary file not shown.
BIN
View File
Binary file not shown.
View File
+15841
View File
File diff suppressed because it is too large Load Diff
+15841
View File
File diff suppressed because it is too large Load Diff
+5805
View File
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+537
View File
@@ -0,0 +1,537 @@
package main
import (
"bytes"
"fmt"
"io"
"log"
"regexp"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/gabriel-vasile/mimetype"
"golang.org/x/exp/slices"
)
// OBIMimeTypeGuesser guesses the MIME type of the data provided by stream.
//
// The text/plain node of the mimetype library is extended with detectors for
// the fasta, fastq, ecoPCR v2, GenBank and EMBL formats. A single Read of at
// most 128 KiB is done on stream, and the bytes actually read are submitted
// to the detection.
//
// Parameters:
//   - stream: the reader to sniff.
//
// Returns:
//   - the detected MIME type,
//   - a reader yielding the sniffed bytes followed by the rest of stream, to
//     be used in place of stream,
//   - an error when the read fails (io.EOF is not an error) or when no MIME
//     type can be determined.
//
// NOTE(review): the detectors are registered again at every call; as
// mimetype keeps them in a package-level tree, registering them only once
// (e.g. with sync.Once) would be preferable if this function is called often.
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
	// The patterns are constant: compile them once instead of at every
	// detector invocation.
	fastaPattern := regexp.MustCompile("^>[^ ]")
	fastqPattern := regexp.MustCompile("^@[^ ]")
	genbankPattern := regexp.MustCompile("^[^ ]* +Genetic Sequence Data Bank *\n")

	fastaDetector := func(raw []byte, limit uint32) bool {
		return fastaPattern.Match(raw)
	}

	fastqDetector := func(raw []byte, limit uint32) bool {
		return fastqPattern.Match(raw)
	}

	ecoPCR2Detector := func(raw []byte, limit uint32) bool {
		return bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
	}

	genbankDetector := func(raw []byte, limit uint32) bool {
		return bytes.HasPrefix(raw, []byte("LOCUS ")) || genbankPattern.Match(raw)
	}

	emblDetector := func(raw []byte, limit uint32) bool {
		return bytes.HasPrefix(raw, []byte("ID "))
	}

	mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta")
	mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq")
	mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
	mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
	mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")

	// Create a buffer to store the read data
	buf := make([]byte, 1024*128)
	n, err := stream.Read(buf)

	if err != nil && err != io.EOF {
		return nil, nil, err
	}

	// Only the bytes actually read are meaningful: the zeroed tail of the
	// buffer must not be seen by the detectors, otherwise a short text
	// input looks like binary data.
	sniffed := buf[:n]

	// Detect the MIME type using the mimetype library
	mimeType := mimetype.Detect(sniffed)
	if mimeType == nil {
		// err is nil or io.EOF here: never return a nil MIME type together
		// with a nil error.
		return nil, nil, fmt.Errorf("cannot detect the MIME type of the stream")
	}

	// Create a new reader based on the read data
	newReader := io.MultiReader(bytes.NewReader(sniffed), stream)

	return mimeType, newReader, nil
}
var xxx1 = `00422_612GNAAXX:7:73:6614:3284#0/1
ccgaatatcttagataccccactatgcttagccctaaacataaacattcaataaacaaga
atgttcgccagagtactactagcaacagcctgaaactcaaagcacttg
>HELIUM_000100422_612GNAAXX:7:13:11063:8138#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaa
attattcgccagagtactaccggcaatagcttaaaactcacagaactt
>HELIUM_000100422_612GNAAXX:7:2:7990:17026#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgcttt
tcaggctgttgctagtagtactctggcgaccattcttgtttattgatt
>HELIUM_000100422_612GNAAXX:7:3:19649:11224#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:3:8446:7884#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:108:8714:2464#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattaatataacaaa
attattcgccagagtactaccggcaatagcttaaaactcaaaggactt
>HELIUM_000100422_612GNAAXX:7:28:3969:15209#0/1
ccaattaacttagataccccactatgcctagccttaaacacaaatagttatgcaaacaaa
actattcgccagagtactaccggcaatagcttaaaactcaacgcactg
>HELIUM_000100422_612GNAAXX:7:44:3269:3608#0/1
gaagtagtagaacaggctcctctagaagggt`
var xxx2 = `>HELIUM_000100422_612GNAAXX:7:13:11063:8138#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaa
attattcgccagagtactaccggcaatagcttaaaactcacagaactt
>HELIUM_000100422_612GNAAXX:7:2:7990:17026#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgcttt
tcaggctgttgctagtagtactctggcgaccattcttgtttattgatt
>HELIUM_000100422_612GNAAXX:7:3:19649:11224#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:3:8446:7884#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:108:8714:2464#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattaatataacaaa
attattcgccagagtactaccggcaatagcttaaaactcaaaggactt
>HELIUM_000100422_612GNAAXX:7:28:3969:15209#0/1
ccaattaacttagataccccactatgcctagccttaaacacaaatagttatgcaaacaaa
actattcgccagagtactaccggcaatagcttaaaactcaacgcactg
>HELIUM_000100422_612GNAAXX:7:44:3269:3608#0/1
gaagtagtagaacaggctcctctagaagggt`
var xxx3 = `00422_612GNAAXX:7:73:6614:3284#0/1
ccgaatatcttagataccccactatgcttagccctaaacataaacattcaataaacaaga
atgttcgccagagtactactagcaacagcctgaaactcaaagcacttg
>HELIUM_000100422_612GNAAXX:7:13:11063:8138#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaa
attattcgccagagtactaccggcaatagcttaaaactcacagaactt
>HELIUM_000100422_612GNAAXX:7:2:7990:17026#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgcttt
tcaggctgttgctagtagtactctggcgaccattcttgtttattgatt
>HELIUM_000100422_612GNAAXX:7:3:19649:11224#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:3:8446:7884#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:108:8714:2464#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattaatataacaaa
attattcgccagagtactaccggcaatagcttaaaactcaaaggactt
>HELIUM_000100422_612GNAAXX:7:28:3969:15209#0/1
ccaattaacttagataccccactatgcctagccttaaacacaaatagttatgcaaacaaa
actattcgccagagtactaccggcaatagcttaaaactcaacgcactg`
var yyy1 = `@HELIUM_000100422_612GNAAXX:7:1:9007:3289#0/1 {"demultiplex_error":"cannot assign the sequence to a sample"}
ccatctctcttagataccccactatgcttagccctaaacacaagtaattaatataacaaaattattcgccagagtactaccggcaatagcttaaaactcaaagaactc
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCACCCCCCCCCBCCACC779?############################################
@HELIUM_000100422_612GNAAXX:7:1:8849:9880#0/1 {"demultiplex_error":"cannot match any primer pair"}
gatcggaagagcggttcagcaggaatgccgagaccgatatcgtatgccgtcttctgcttgaaaaaaaaaacaaaataggagagtagactcactgccagtggtcgtcag
`
// LastFastqCut splits buffer just before the last fastq record whose header,
// sequence and '+' lines can all be recognized.
//
// The buffer is scanned backward with a small state machine:
//   - state 0: looking for a '+' character;
//   - state 1: the character preceding the '+' must be an end of line;
//   - state 2: separators are skipped, then a sequence character is expected;
//   - state 3: walking back over the sequence line up to an end of line;
//   - state 4: skipping the end of line characters ending the header line;
//   - state 5: walking back over the header line, looking for a '@';
//   - state 6: the character preceding the '@' must be an end of line;
//   - state 7: success.
//
// Whenever an expectation fails, the machine returns to state 0 and the scan
// resumes just before the last '+' considered.
//
// NOTE(review): only 'a'..'z', '-' and '.' are accepted as sequence
// characters, so records with an upper case sequence are never recognized —
// confirm this is intended.
//
// Returns:
//   - on success, buffer[:cut] (sharing its memory with buffer) and a copy of
//     buffer[cut:], where cut is the position of the '@' found;
//   - otherwise, an empty slice and buffer itself.
func LastFastqCut(buffer []byte) ([]byte, []byte) {
	imax := len(buffer)
	cut := imax
	state := 0
	// restart is the position the scan is rewound to when a candidate fails
	restart := imax - 1
	for i := restart; i >= 0 && state < 7; i-- {
		C := buffer[i]
		is_end_of_line := C == '\r' || C == '\n'
		is_space := C == ' ' || C == '\t'
		is_sep := is_space || is_end_of_line
		switch state {
		case 0:
			if C == '+' {
				// Potential start of quality part step 1
				state = 1
				restart = i
			}
		case 1:
			if is_end_of_line {
				// Potential start of quality part step 2
				state = 2
			} else {
				// it was not the start of quality part
				state = 0
				// the loop decrement makes the scan resume at restart-1
				i = restart
			}
		case 2:
			if is_sep {
				// Potential start of quality part step 2 (stay in the same state)
				state = 2
			} else if (C >= 'a' && C <= 'z') || C == '-' || C == '.' {
				// End of the sequence
				state = 3
			} else {
				// it was not the start of quality part
				state = 0
				i = restart
			}
		case 3:
			if is_end_of_line {
				// Entering the header line
				state = 4
			} else if (C >= 'a' && C <= 'z') || C == '-' || C == '.' {
				// progressing along the sequence
				state = 3
			} else {
				// it was not the sequence part
				state = 0
				i = restart
			}
		case 4:
			if is_end_of_line {
				// still on the end of line characters closing the header line
				state = 4
			} else {
				// last character of the header line
				state = 5
			}
		case 5:
			if is_end_of_line {
				// It was not the header line
				state = 0
				i = restart
			} else if C == '@' {
				// candidate start of the record
				state = 6
				cut = i
			}
		case 6:
			if is_end_of_line {
				// the '@' is the first character of its line: record found
				state = 7
			} else {
				// the '@' was inside the line, not the start of a header
				state = 0
				i = restart
			}
		}
	}
	if state == 7 {
		return buffer[:cut], bytes.Clone(buffer[cut:])
	}
	return []byte{}, buffer
}
// LastSequenceCut splits buffer just before its last fasta record.
//
// The buffer is scanned backward, looking for a '>' immediately preceded by
// an end of line character ('\r' or '\n').
//
// Returns:
//   - when such a '>' is found at position p: buffer[:p] (sharing its memory
//     with buffer) and a copy of buffer[p:];
//   - otherwise: an empty slice and buffer itself.
func LastSequenceCut(buffer []byte) ([]byte, []byte) {
	const (
		searching = iota // looking for a '>'
		candidate        // a '>' was just seen, an end of line must precede it
		found            // the '>' starts a line
	)

	cutAt := 0
	status := searching

	for pos := len(buffer) - 1; pos >= 0 && status < found; pos-- {
		c := buffer[pos]

		switch {
		case status == searching && c == '>':
			status = candidate
			cutAt = pos
		case status == candidate && (c == '\r' || c == '\n'):
			status = found
		default:
			status = searching
		}
	}

	if status != found {
		return []byte{}, buffer
	}

	return buffer[:cutAt], bytes.Clone(buffer[cutAt:])
}
// FirstSequenceCut splits buffer just before its first fasta record.
//
// The buffer is scanned forward, looking for a '>' located either at the very
// beginning of the buffer or right after an end of line character.
//
// Returns:
//   - when such a '>' is found at position p: a copy of buffer[:p] and
//     buffer[p:] (sharing its memory with buffer);
//   - otherwise: buffer itself and an empty slice.
func FirstSequenceCut(buffer []byte) ([]byte, []byte) {
	const (
		inLine    = iota // somewhere inside a line
		lineStart        // the previous character was an end of line
		found            // a '>' starting a line was met
	)

	cutAt := 0
	status := inLine

	for pos := 0; pos < len(buffer) && status < found; pos++ {
		c := buffer[pos]

		switch {
		case c == '\r' || c == '\n':
			status = lineStart
		case c == '>' && (status == lineStart || pos == 0):
			status = found
			cutAt = pos
		default:
			status = inLine
		}
	}

	if status != found {
		return buffer, []byte{}
	}

	return bytes.Clone(buffer[:cutAt]), buffer[cutAt:]
}
// FullSequenceCut splits buffer into three parts using FirstSequenceCut and
// LastSequenceCut.
//
// Returns:
//   - the bytes preceding the first record,
//   - the bytes from the first record up to the last record (excluded),
//   - the bytes of the last record.
//
// When FirstSequenceCut leaves nothing after the head, the last two parts
// are empty.
func FullSequenceCut(buffer []byte) ([]byte, []byte, []byte) {
	head, records := FirstSequenceCut(buffer)

	if len(records) == 0 {
		return head, []byte{}, []byte{}
	}

	body, tail := LastSequenceCut(records)

	return head, body, tail
}
// Concatenate returns the elements of s1 followed by those of s2.
//
// When one of the slices is empty the other one is returned as is. When both
// hold elements, the result is built by an append on s1 with its capacity
// clamped to its length, so the spare capacity of s1 is never overwritten.
func Concatenate[S ~[]E, E any](s1, s2 S) S {
	if len(s1) == 0 {
		return s2
	}

	if len(s2) == 0 {
		return s1
	}

	// The full slice expression forces append to allocate a new array.
	return append(s1[:len(s1):len(s1)], s2...)
}
// FastxChunk is a block of raw bytes cut out of a sequence file.
type FastxChunk struct {
	// Bytes holds the raw data of the chunk.
	Bytes []byte
	// index is the rank of the chunk, as numbered from 0 by FastaChunkReader;
	// ParseFastaChunk uses it as the order of the batch it builds.
	index int
}
// FastaChunkReader reads r by blocks of at most size bytes and sends them on
// the returned channel as chunks ending just before a fasta record start.
//
// After each read, the bytes following the last record start found (see
// LastSequenceCut) are kept aside and prepended to the next block. What
// remains once the input is exhausted is sent as the last chunk. The channel
// is always closed, whatever the way the reading ends, so ranging over it
// cannot block forever.
//
// Parameters:
//   - r: the reader providing the fasta data.
//   - size: the size in bytes of the read buffer.
//   - cutHead: when false, bytes preceding the first record of the first
//     block are an error; when true, they are discarded.
//
// Returns the channel of chunks and an error when the first read fails or
// when unexpected bytes precede the first record. In both cases the returned
// channel is already closed.
func FastaChunkReader(r io.Reader, size int, cutHead bool) (chan FastxChunk, error) {
	out := make(chan FastxChunk)
	buff := make([]byte, size)

	n, err := r.Read(buff)

	if err != nil && err != io.EOF {
		// Report the failure instead of returning a channel nobody feeds.
		close(out)
		return out, err
	}

	if n == 0 {
		// Nothing to read: provide an empty, closed channel.
		close(out)
		return out, nil
	}

	buff = buff[:n]

	begin, buff := FirstSequenceCut(buff)

	if len(begin) > 0 && !cutHead {
		close(out)
		return out, fmt.Errorf("begin is not empty : %s", string(begin))
	}

	go func(buff []byte, n int, err error) {
		defer close(out)

		idx := 0
		end := []byte{}

		for n > 0 {
			buff = Concatenate(end, buff)
			buff, end = LastSequenceCut(buff)

			if len(buff) > 0 {
				out <- FastxChunk{
					Bytes: bytes.Clone(buff),
					index: idx,
				}
				idx++
			} else {
				fmt.Println("***** Empty buff *****")
			}

			// An io.Reader may return data together with io.EOF (or with
			// an error): that data has just been handled, nothing more can
			// be read.
			if err != nil {
				break
			}

			buff = slices.Grow(buff[:0], size)[0:size]
			n, err = r.Read(buff)
			buff = buff[:n]
		}

		if err != nil && err != io.EOF {
			log.Println("FastaChunkReader: reading stopped on error:", err)
		}

		if len(end) > 0 {
			out <- FastxChunk{
				Bytes: bytes.Clone(end),
				index: idx,
			}
		}
	}(buff, n, err)

	return out, nil
}
// ParseFastaChunk parses the fasta records stored in ch.Bytes and returns
// them as a batch whose order is ch.index.
//
// The chunk is parsed by a state machine:
//   - state 0: looking for the '>' starting a record;
//   - state 1: first character of the identifier;
//   - state 2: inside the identifier;
//   - state 3: spaces between the identifier and the definition;
//   - state 4: inside the definition;
//   - state 5: end of the header line, waiting for the sequence;
//   - state 6: inside the sequence.
//
// The sequence is compacted in place in ch.Bytes: letters are converted to
// lower case and only 'a'..'z', '-' and '.' are kept.
//
// Returns nil when a '>' is directly followed by a space or an end of line
// (record without identifier). A header truncated at the end of the chunk,
// before the end of its line, does not produce any sequence.
func ParseFastaChunk(ch FastxChunk) *obiiter.BioSequenceBatch {
	// NOTE(review): debugging trace kept from the original code; it dumps the
	// whole chunk on stdout and should probably be removed.
	fmt.Println(string(ch.Bytes))

	slice := make(obiseq.BioSequenceSlice, 0, obioptions.CLIBatchSize())

	state := 0
	start := 0
	current := 0
	var identifier string
	var definition string

	for i := 0; i < len(ch.Bytes); i++ {
		C := ch.Bytes[i]
		is_end_of_line := C == '\r' || C == '\n'
		is_space := C == ' ' || C == '\t'
		is_sep := is_space || is_end_of_line

		switch state {
		case 0:
			if C == '>' {
				// Beginning of sequence
				state = 1
			}
		case 1:
			if is_sep {
				// No identifier -> ERROR
				return nil
			} else {
				// Beginning of identifier
				state = 2
				start = i
			}
		case 2:
			if is_end_of_line {
				// End of identifier and of the header line: there is no
				// definition, the next line is already the sequence.
				identifier = string(ch.Bytes[start:i])
				definition = ""
				start, current = i+1, i+1
				state = 5
			} else if is_space {
				// End of identifier
				identifier = string(ch.Bytes[start:i])
				state = 3
			}
		case 3:
			if is_end_of_line {
				// Definition empty
				definition = ""
				start, current = i+1, i+1
				state = 5
			} else if !is_space {
				// Beginning of definition
				start = i
				state = 4
			}
		case 4:
			if is_end_of_line {
				definition = string(ch.Bytes[start:i])
				// The sequence is empty until a first residue is met
				start, current = i+1, i+1
				state = 5
			}
		case 5:
			if is_end_of_line {
				break
			}
			// Beginning of sequence: the current character belongs to the
			// sequence and must be handled like all the following ones.
			start, current = i, i
			state = 6
			fallthrough
		case 6:
			if C == '>' {
				// End of sequence
				s := obiseq.NewBioSequence(identifier, bytes.Clone(ch.Bytes[start:current]), definition)
				slice = append(slice, s)
				state = 1
			} else if !is_sep {
				if C >= 'A' && C <= 'Z' {
					C = C + 'a' - 'A'
				}
				// Removing white space from the sequence
				if (C >= 'a' && C <= 'z') || C == '-' || C == '.' {
					ch.Bytes[current] = C
					current++
				}
			}
		}
	}

	fmt.Printf("Index = %d, State = %d\n", ch.index, state)

	// A last record is pending only when its header line has been read
	// completely (states 5 and 6); in any other state there is nothing
	// valid to add.
	if state == 5 || state == 6 {
		slice = append(slice, obiseq.NewBioSequence(identifier, bytes.Clone(ch.Bytes[start:current]), definition))
	}

	batch := obiiter.MakeBioSequenceBatch(ch.index, slice)
	return &batch
}
// ReadFastaSequence reads fasta formatted data from reader and returns an
// iterator over the parsed sequences.
//
// The input is cut in chunks by FastaChunkReader (read buffer of 1024*500
// bytes, no bytes allowed before the first record). The chunks are parsed by
// obioptions.CLIReadParallelWorkers() goroutines running ParseFastaChunk;
// chunks for which the parser returns nil are skipped. The batches are
// finally sorted and rebatched to obioptions.CLIBatchSize().
//
// The function panics when FastaChunkReader returns an error.
func ReadFastaSequence(reader io.Reader) obiiter.IBioSequence {
	iterator := obiiter.MakeIBioSequence()

	workers := obioptions.CLIReadParallelWorkers()
	iterator.Add(workers)

	chunks, err := FastaChunkReader(reader, 1024*500, false)
	if err != nil {
		log.Panicln("Error:", err)
	}

	// Closes the iterator once every worker is done.
	go func() {
		iterator.WaitAndClose()
	}()

	for w := 0; w < workers; w++ {
		go func() {
			defer iterator.Done()

			for chunk := range chunks {
				if batch := ParseFastaChunk(chunk); batch != nil {
					iterator.Push(*batch)
				}
			}
		}()
	}

	return iterator.SortBatches().Rebatch(obioptions.CLIBatchSize())
}
func main() {
// if len(os.Args) != 2 {
// fmt.Println("Usage: go run main.go <filename>")
// return
// }
// filename := os.Args[1]
// filename := "100.fasta"
// file, err := os.Open(filename)
// if err != nil {
// fmt.Println("Error:", err)
// return
// }
// defer file.Close()
// mimeType, input, err := OBIMimeTypeGuesser(file)
// if err != nil {
// fmt.Println("Error:", err)
// return
// }
// fmt.Println("Detected MIME Type:", mimeType.String())
// ch, err := FastaChunkReader(input, 1024, false)
// if err != nil {
// fmt.Println("Error:", err)
// return
// }
// for chk := range ch {
// fmt.Printf("--------------------\n")
// b := ParseFastaChunk(chk)
// fmt.Printf("-------- %d --------\n", b.Order())
// for _, b := range b.Slice() {
// fmt.Printf("--%s--\t--%s--\t--%s--\n", b.Id(), b.Definition(), b.String())
// }
// }
d1, f1 := LastFastqCut([]byte(yyy1))
// d2, f2 := LastSequenceCut([]byte(xxx2))
// d3, f3 := LastSequenceCut([]byte(xxx3))
fmt.Println("Last Sequence Cut 1:", string(d1), "---", string(f1))
// fmt.Println("Last Sequence Cut 2:", string(d2), "---", string(f2))
// fmt.Println("Last Sequence Cut 3:", string(d3), "---", string(f3))
// d1, b1, f1 := FullSequenceCut([]byte(xxx1))
// d2, b2, f2 := FullSequenceCut([]byte(xxx2))
// d3, b3, f3 := FullSequenceCut([]byte(xxx3))
// fmt.Println("Last Sequence Cut:", string(d1), "---", string(b1), "---", string(f1))
// fmt.Println("Last Sequence Cut:", string(d2), "---", string(b2), "---", string(f2))
// fmt.Println("Last Sequence Cut:", string(d3), "---", string(b3), "---", string(f3))
// Now you can use "extractedData" to access the read data with the associated MIME type.
// For example, you can copy the data into a buffer for further manipulation.
}
+1
View File
@@ -0,0 +1 @@
061a528427b8ecb0b30df8e7923edc4220443ade
+45277
View File
File diff suppressed because it is too large Load Diff
BIN
View File
Binary file not shown.
+9322
View File
File diff suppressed because it is too large Load Diff
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
+6
View File
@@ -0,0 +1,6 @@
id,HELIUM_000100422_612GNAAXX:7:100:4828:3492#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:113:17236:15166#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:16:12111:9453#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:17:3675:13316#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:20:15729:20493#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:22:2603:18023#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:25:11714:14251#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:2:15508:17530#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:30:17945:19531#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:34:14122:13731#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:34:17680:16952#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:52:12776:11698#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:53:17880:8617#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:58:11419:17203#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:5:15939:5437#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:66:4039:8016#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:69:15276:10367#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:72:17638:8081#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:7:18108:9040#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:7:2880:4021#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:80:10626:19388#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:81:18704:12346#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:84:14502:1617#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:84:16335:5083#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:94:16908:11285#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:9:9623:15395#0/1_sub[28..127]
15a_F730814,0,0,0,0,0,9165,0,0,0,0,0,0,0,5,0,0,0,0,4,0,0,0,0,0,0,0
13a_F730603,22,0,1,0,0,0,0,0,0,0,15,0,0,19,0,25,0,0,9,0,20,0,0,0,8409,0
29a_F260619,0,0,13,0,16,6139,0,0,0,0,0,0,0,1,0,0,25,0,0,44,0,0,391,110,0,353
26a_F040644,0,72,0,17,0,0,14,18,43,31,0,52,88,481,12830,0,0,15,0,0,0,208,0,14,0,0
1 id HELIUM_000100422_612GNAAXX:7:100:4828:3492#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:113:17236:15166#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:16:12111:9453#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:17:3675:13316#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:20:15729:20493#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:22:2603:18023#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:25:11714:14251#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:2:15508:17530#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:30:17945:19531#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:34:14122:13731#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:34:17680:16952#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:52:12776:11698#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:53:17880:8617#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:58:11419:17203#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:5:15939:5437#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:66:4039:8016#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:69:15276:10367#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:72:17638:8081#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:7:18108:9040#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:7:2880:4021#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:80:10626:19388#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:81:18704:12346#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:84:14502:1617#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:84:16335:5083#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:94:16908:11285#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:9:9623:15395#0/1_sub[28..127]
2 15a_F730814 0 0 0 0 0 9165 0 0 0 0 0 0 0 5 0 0 0 0 4 0 0 0 0 0 0 0
3 13a_F730603 22 0 1 0 0 0 0 0 0 0 15 0 0 19 0 25 0 0 9 0 20 0 0 0 8409 0
4 29a_F260619 0 0 13 0 16 6139 0 0 0 0 0 0 0 1 0 0 25 0 0 44 0 0 391 110 0 353
5 26a_F040644 0 72 0 17 0 0 14 18 43 31 0 52 88 481 12830 0 0 15 0 0 0 208 0 14 0 0
+4
View File
@@ -0,0 +1,4 @@
wolf_diet 13a_F730603 aattaac TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
wolf_diet 13b_F730603 gaagtag TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
wolf_diet 26a_F040644 gaatatc TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
wolf_diet 29a_F260619 gcctcct TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
+181104
View File
File diff suppressed because it is too large Load Diff
+1
View File
@@ -0,0 +1 @@
b92f63b34879105e61db0faf868f2828b4db298d
+77043
View File
File diff suppressed because it is too large Load Diff
+1860
View File
File diff suppressed because it is too large Load Diff
+1104
View File
File diff suppressed because it is too large Load Diff